diff --git a/integrations/astra/pyproject.toml b/integrations/astra/pyproject.toml index 152d93a486..b1325354ce 100644 --- a/integrations/astra/pyproject.toml +++ b/integrations/astra/pyproject.toml @@ -22,7 +22,12 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "pydantic", "typing_extensions", "astrapy>=1.5.0,<2.0"] +dependencies = [ + "astrapy>=1.5.0,<2.0", + "haystack-ai>=2.24.0", + "pydantic", + "typing_extensions", +] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/astra#readme" diff --git a/integrations/astra/tests/test_document_store.py b/integrations/astra/tests/test_document_store.py index 3d2695d543..5235c3fae5 100644 --- a/integrations/astra/tests/test_document_store.py +++ b/integrations/astra/tests/test_document_store.py @@ -10,7 +10,7 @@ from haystack import Document from haystack.document_stores.errors import MissingDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import DocumentStoreBaseExtendedTests from haystack_integrations.document_stores.astra import AstraDocumentStore @@ -48,9 +48,9 @@ def test_to_dict(mock_auth): # noqa os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "") == "", reason="ASTRA_DB_APPLICATION_TOKEN env var not set" ) @pytest.mark.skipif(os.environ.get("ASTRA_DB_API_ENDPOINT", "") == "", reason="ASTRA_DB_API_ENDPOINT env var not set") -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(DocumentStoreBaseExtendedTests): """ - Common test cases will be provided by `DocumentStoreBaseTests` but + Common test cases will be provided by `DocumentStoreBaseExtendedTests` but you can add more to this class. """ @@ -204,74 +204,6 @@ def test_filter_documents_by_in_operator(self, document_store): self.assert_documents_are_equal([result[0]], [docs[0]]) self.assert_documents_are_equal([result[1]], [docs[1]]) - def test_delete_all_documents(self, document_store: AstraDocumentStore): - """ - Test delete_all_documents() on an Astra. 
- """ - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - - def test_delete_by_filter(self, document_store: AstraDocumentStore, filterable_docs): - document_store.write_documents(filterable_docs) - initial_count = document_store.count_documents() - assert initial_count > 0 - - # count documents that match the filter before deletion - matching_docs = [d for d in filterable_docs if d.meta.get("chapter") == "intro"] - expected_deleted_count = len(matching_docs) - - # delete all documents with chapter="intro" - deleted_count = document_store.delete_by_filter( - filters={"field": "meta.chapter", "operator": "==", "value": "intro"} - ) - - assert deleted_count == expected_deleted_count - assert document_store.count_documents() == initial_count - deleted_count - - # remaining documents don't have chapter="intro" - remaining_docs = document_store.filter_documents() - for doc in remaining_docs: - assert doc.meta.get("chapter") != "intro" - - # all documents with chapter="intro" were deleted - intro_docs = document_store.filter_documents( - filters={"field": "meta.chapter", "operator": "==", "value": "intro"} - ) - assert len(intro_docs) == 0 - - def test_update_by_filter(self, document_store: AstraDocumentStore, filterable_docs): - document_store.write_documents(filterable_docs) - initial_count = document_store.count_documents() - assert initial_count > 0 - - # count documents that match the filter before update - matching_docs = [d for d in filterable_docs if d.meta.get("chapter") == "intro"] - expected_updated_count = len(matching_docs) - - # update all documents with chapter="intro" to have status="updated" - updated_count = document_store.update_by_filter( - filters={"field": "meta.chapter", "operator": "==", "value": "intro"}, - meta={"status": "updated"}, - ) - - assert updated_count == expected_updated_count - assert document_store.count_documents() == initial_count - - # verify the updated documents have the new metadata - updated_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "updated"} - ) - assert len(updated_docs) == expected_updated_count - for doc in updated_docs: - assert doc.meta.get("chapter") == "intro" - assert doc.meta.get("status") == "updated" - - # verify other documents weren't affected - all_docs = document_store.filter_documents() - for doc in all_docs: - if doc.meta.get("chapter") != "intro": - assert doc.meta.get("status") != "updated" - @pytest.mark.skip(reason="Unsupported filter operator not.") def test_not_operator(self, document_store, filterable_docs): pass diff --git a/integrations/azure_ai_search/pyproject.toml b/integrations/azure_ai_search/pyproject.toml index 740ee4d376..57fa4f6963 100644 --- a/integrations/azure_ai_search/pyproject.toml +++ b/integrations/azure_ai_search/pyproject.toml @@ -22,7 +22,11 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "azure-search-documents>=11.5", "azure-identity"] +dependencies = [ + "haystack-ai>=2.24.0", + "azure-search-documents>=11.5", + "azure-identity" +] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/azure_ai_search#readme" diff --git a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/filters.py b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/filters.py 
index 8dfa854f34..da34e6396b 100644 --- a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/filters.py +++ b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/filters.py @@ -68,11 +68,19 @@ def _parse_comparison_condition(condition: dict[str, Any]) -> str: def _eq(field: str, value: Any) -> str: - return f"{field} eq '{value}'" if isinstance(value, str) and value != "null" else f"{field} eq {value}" + if isinstance(value, str) and value != "null": + return f"{field} eq '{value}'" + if isinstance(value, bool): + return f"{field} eq {str(value).lower()}" + return f"{field} eq {value}" def _ne(field: str, value: Any) -> str: - return f"not ({field} eq '{value}')" if isinstance(value, str) and value != "null" else f"not ({field} eq {value})" + if isinstance(value, str) and value != "null": + return f"not ({field} eq '{value}')" + if isinstance(value, bool): + return f"not ({field} eq {str(value).lower()})" + return f"not ({field} eq {value})" def _in(field: str, value: Any) -> str: diff --git a/integrations/azure_ai_search/tests/conftest.py b/integrations/azure_ai_search/tests/conftest.py index 68d654e4d3..f77c9cde5b 100644 --- a/integrations/azure_ai_search/tests/conftest.py +++ b/integrations/azure_ai_search/tests/conftest.py @@ -70,8 +70,9 @@ def delete_by_filter_and_wait(filters): time.sleep(SLEEP_TIME_IN_SECONDS) return deleted_count - def update_by_filter_and_wait(filters, fields): - updated_count = original_update_by_filter(filters, fields) + def update_by_filter_and_wait(filters, meta=None, fields=None): + updates = meta if meta is not None else fields or {} + updated_count = original_update_by_filter(filters, updates) time.sleep(SLEEP_TIME_IN_SECONDS) return updated_count diff --git a/integrations/azure_ai_search/tests/test_document_store.py b/integrations/azure_ai_search/tests/test_document_store.py index 069f5bb99b..31753f3e52 100644 --- a/integrations/azure_ai_search/tests/test_document_store.py +++ b/integrations/azure_ai_search/tests/test_document_store.py @@ -13,8 +13,12 @@ from haystack.errors import FilterError from haystack.testing.document_store import ( CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, DeleteDocumentsTest, + FilterableDocsFixtureMixin, FilterDocumentsTest, + UpdateByFilterTest, WriteDocumentsTest, ) from haystack.utils.auth import EnvVarSecret, Secret @@ -256,7 +260,15 @@ def _assert_documents_are_equal(received: list[Document], expected: list[Documen not os.environ.get("AZURE_AI_SEARCH_ENDPOINT", None) and not os.environ.get("AZURE_AI_SEARCH_API_KEY", None), reason="Missing AZURE_AI_SEARCH_ENDPOINT or AZURE_AI_SEARCH_API_KEY.", ) -class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): +class TestDocumentStore( + CountDocumentsTest, + DeleteDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, + FilterableDocsFixtureMixin, + WriteDocumentsTest, + UpdateByFilterTest, +): def assert_documents_are_equal(self, received: list[Document], expected: list[Document]): _assert_documents_are_equal(received, expected) @@ -290,43 +302,14 @@ def test_write_documents_duplicate_fail(self, document_store: AzureAISearchDocum @pytest.mark.skip(reason="Azure AI search index overwrites duplicate documents by default") def test_write_documents_duplicate_skip(self, document_store: AzureAISearchDocumentStore): ... 
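Reviewer note (illustrative, not part of the applied diff): the _eq/_ne changes in filters.py above fix boolean rendering. Python's str(True) is "True", which Azure AI Search rejects because OData expects lowercase boolean literals. The conftest shim accepts both the new `meta` and the legacy `fields` keyword so either caller keeps working. A quick sanity check against the patched helpers (they are private, so this is for illustration only, not a suggested test addition):

    from haystack_integrations.document_stores.azure_ai_search.filters import _eq, _ne

    assert _eq("featured", True) == "featured eq true"          # lowercase OData boolean
    assert _ne("featured", False) == "not (featured eq false)"
    assert _eq("category", "A") == "category eq 'A'"            # strings keep their quotes
    assert _eq("year", 2024) == "year eq 2024"                  # numbers stay unquoted, as before
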
-    def test_delete_all_documents(self, document_store: AzureAISearchDocumentStore):
-        docs = [Document(content="first doc"), Document(content="second doc")]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
-    def test_delete_all_documents_empty_index(self, document_store: AzureAISearchDocumentStore):
-        assert document_store.count_documents() == 0
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
     @pytest.mark.parametrize(
         "document_store",
         [{"metadata_fields": {"category": str}}],
         indirect=True,
     )
     def test_delete_by_filter(self, document_store: AzureAISearchDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-            Document(content="Doc 3", meta={"category": "A"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Delete documents with category="A"
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}
-        )
-        assert deleted_count == 2
-        assert document_store.count_documents() == 1
-
-        # Verify only category B remains
-        remaining_docs = document_store.filter_documents()
-        assert len(remaining_docs) == 1
-        assert remaining_docs[0].meta["category"] == "B"
+        """Override to use a document_store with category metadata field."""
+        super().test_delete_by_filter(document_store)

     @pytest.mark.parametrize(
         "document_store",
@@ -334,76 +317,65 @@ def test_delete_by_filter(self, document_store: AzureAISearchDocumentStore):
         indirect=True,
     )
     def test_delete_by_filter_no_matches(self, document_store: AzureAISearchDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        # Try to delete documents with category="C" (no matches)
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}
-        )
-        assert deleted_count == 0
-        assert document_store.count_documents() == 2
+        """Override to use a document_store with category metadata field."""
+        super().test_delete_by_filter_no_matches(document_store)

     @pytest.mark.parametrize(
         "document_store",
-        [{"metadata_fields": {"category": str, "status": str}}],
+        [{"metadata_fields": {"category": str, "year": int, "status": str}}],
         indirect=True,
     )
-    def test_update_by_filter(self, document_store: AzureAISearchDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "B", "status": "draft"}),
-            Document(content="Doc 3", meta={"category": "A", "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
+    def test_delete_by_filter_advanced_filters(self, document_store: AzureAISearchDocumentStore):
+        """Override to use a document_store with category, year, status metadata fields."""
+        super().test_delete_by_filter_advanced_filters(document_store)
+
+    # Metadata fields required by the filterable_docs fixture from haystack's UpdateByFilterTest
+    # (name, page, chapter, number, date, etc.)
+    _FILTERABLE_DOCS_METADATA = {  # noqa: RUF012
+        "name": str,
+        "page": str,
+        "chapter": str,
+        "number": int,
+        "date": str,
+        "no_embedding": bool,
+        "updated": bool,
+        "extra_field": str,
+    }

-        # Update status for category="A" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"},
-            fields={"status": "published"},
-        )
-        assert updated_count == 2
+    @pytest.mark.parametrize(
+        "document_store",
+        [{"metadata_fields": _FILTERABLE_DOCS_METADATA}],
+        indirect=True,
+    )
+    def test_update_by_filter(self, document_store: AzureAISearchDocumentStore, filterable_docs):
+        """Override to use a document_store with metadata fields for filterable_docs."""
+        super().test_update_by_filter(document_store, filterable_docs)

-        # Verify the updates
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["category"] == "A"
-            assert doc.meta["status"] == "published"
-
-        # Verify category B still has draft status
-        draft_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "draft"}
-        )
-        assert len(draft_docs) == 1
-        assert draft_docs[0].meta["category"] == "B"
+    @pytest.mark.parametrize(
+        "document_store",
+        [{"metadata_fields": _FILTERABLE_DOCS_METADATA}],
+        indirect=True,
+    )
+    def test_update_by_filter_no_matches(self, document_store: AzureAISearchDocumentStore, filterable_docs):
+        """Override to use a document_store with metadata fields for filterable_docs."""
+        super().test_update_by_filter_no_matches(document_store, filterable_docs)

     @pytest.mark.parametrize(
         "document_store",
-        [{"metadata_fields": {"category": str, "status": str}}],
+        [{"metadata_fields": _FILTERABLE_DOCS_METADATA}],
         indirect=True,
     )
-    def test_update_by_filter_no_matches(self, document_store: AzureAISearchDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "B", "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
+    def test_update_by_filter_multiple_fields(self, document_store: AzureAISearchDocumentStore, filterable_docs):
+        """Override to use a document_store with metadata fields for filterable_docs."""
+        super().test_update_by_filter_multiple_fields(document_store, filterable_docs)

-        # Try to update documents with category="C" (no matches)
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"},
-            fields={"status": "published"},
-        )
-        assert updated_count == 0
+    @pytest.mark.parametrize(
+        "document_store",
+        [{"metadata_fields": {"category": str, "year": int, "status": str, "featured": bool}}],
+        indirect=True,
+    )
+    def test_update_by_filter_advanced_filters(self, document_store: AzureAISearchDocumentStore):
+        """Override to use a document_store with category, year, status, featured metadata fields."""
+        super().test_update_by_filter_advanced_filters(document_store)

     @pytest.mark.parametrize(
         "document_store",
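Reviewer note (illustrative, not part of the applied diff): the indirect=True parametrizations above hand {"metadata_fields": ...} to the document_store fixture, which must declare those typed fields in the index schema before the base tests write documents. The Azure conftest is not shown in this diff; a hypothetical fixture consuming the parameter could look like:

    import pytest

    from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore

    @pytest.fixture
    def document_store(request):
        # request.param is only set for tests parametrized with indirect=True
        params = getattr(request, "param", {})
        store = AzureAISearchDocumentStore(
            index_name="haystack-test-index",               # hypothetical index name
            metadata_fields=params.get("metadata_fields"),  # typed fields added to the index
        )
        yield store
        # index cleanup omitted for brevity

diff --git a/integrations/chroma/pyproject.toml b/integrations/chroma/pyproject.toml
index 8283899e3d..c5eb5ff205 100644
--- a/integrations/chroma/pyproject.toml
+++ b/integrations/chroma/pyproject.toml
@@ -22,7 +22,10 @@ classifiers = [
    "Programming Language :: Python :: Implementation :: 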
CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "chromadb>=1.0.2"] +dependencies = [ + "haystack-ai>=2.24.0", + "chromadb>=1.0.2" +] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/chroma#readme" diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index 370156246c..d7e1eb3eb8 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -856,12 +856,13 @@ def delete_all_documents(self, *, recreate_index: bool = False) -> None: else: collection = self._collection.get() ids = collection.get("ids", []) - self._collection.delete(ids=ids) # type: ignore - logger.info( - "Deleted all the {n_docs} documents from the collection '{name}'.", - name=self._collection_name, - n_docs=len(ids), - ) + if ids: + self._collection.delete(ids=ids) # type: ignore + logger.info( + "Deleted all the {n_docs} documents from the collection '{name}'.", + name=self._collection_name, + n_docs=len(ids), + ) except Exception as e: msg = f"Failed to delete all documents from ChromaDB: {e!s}" raise DocumentStoreError(msg) from e @@ -895,12 +896,13 @@ async def delete_all_documents_async(self, *, recreate_index: bool = False) -> N else: collection = await self._async_collection.get() ids = collection.get("ids", []) - await self._async_collection.delete(ids=ids) # type: ignore - logger.info( - "Deleted all the {n_docs} documents from the collection '{name}'.", - name=self._collection_name, - n_docs=len(ids), - ) + if ids: + await self._async_collection.delete(ids=ids) # type: ignore + logger.info( + "Deleted all the {n_docs} documents from the collection '{name}'.", + name=self._collection_name, + n_docs=len(ids), + ) except Exception as e: msg = f"Failed to delete all documents from ChromaDB: {e!s}" diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 3d52a6d3a2..dacd50cd35 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -14,8 +14,12 @@ from haystack.testing.document_store import ( TEST_EMBEDDING_1, CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, DeleteDocumentsTest, + FilterableDocsFixtureMixin, FilterDocumentsTest, + UpdateByFilterTest, ) from haystack_integrations.document_stores.chroma import ChromaDocumentStore @@ -34,7 +38,15 @@ def clear_chroma_system_cache(): SharedSystemClient.clear_system_cache() -class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest): +class TestDocumentStore( + CountDocumentsTest, + DeleteDocumentsTest, + FilterDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + DeleteAllTest, + DeleteByFilterTest, +): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. 
@@ -465,88 +477,6 @@ def test_delete_all_documents_no_index_recreation(self, document_store: ChromaDo assert len(results) == 1 assert results[0].content == "New document after delete all" - def test_delete_by_filter(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # delete documents with category="A" - deleted_count = document_store.delete_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert deleted_count == 2 - assert document_store.count_documents() == 1 - - # verify only category B remains - remaining_docs = document_store.filter_documents() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - def test_delete_by_filter_no_matches(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - # delete documents with category="C" (no matches) - deleted_count = document_store.delete_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "C"} - ) - assert deleted_count == 0 - assert document_store.count_documents() == 2 - - def test_update_by_filter(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # update status for category="A" documents - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 - - # verify the updated documents have the new metadata - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["category"] == "A" - - # Verify documents with category="B" were not updated - unpublished_docs = document_store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "B"} - ) - assert len(unpublished_docs) == 1 - assert unpublished_docs[0].meta["status"] == "draft" - - def test_update_by_filter_no_matches(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - # Try to update documents with category="C" (no matches) - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} - ) - assert updated_count == 0 - assert document_store.count_documents() == 2 - @pytest.mark.integration def test_search_embeddings(self, document_store: ChromaDocumentStore): query_embedding = TEST_EMBEDDING_1 diff --git a/integrations/elasticsearch/pyproject.toml b/integrations/elasticsearch/pyproject.toml index f8c32250c3..75f9f07506 
100644 --- a/integrations/elasticsearch/pyproject.toml +++ b/integrations/elasticsearch/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.22.0", + "haystack-ai>=2.24.0", "elasticsearch>=8,<9", "aiohttp>=3.9.0" # for async support https://elasticsearch-py.readthedocs.io/en/latest/async.html#valueerror-when-initializing-asyncelasticsearch ] diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 58f326efb1..b9001743cb 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -10,7 +10,7 @@ from haystack.dataclasses.document import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import DocumentStoreBaseExtendedTests from haystack.utils import Secret from haystack.utils.auth import TokenSecret @@ -199,9 +199,9 @@ def test_client_initialization_with_api_key_string(_mock_async_es, _mock_es): @pytest.mark.integration -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(DocumentStoreBaseExtendedTests): """ - Common test cases will be provided by `DocumentStoreBaseTests` but + Common test cases will be provided by `DocumentStoreBaseExtendedTests` but you can add more to this class. """ @@ -501,74 +501,6 @@ def test_delete_all_documents_index_recreation(self, document_store: Elasticsear assert len(results) == 1 assert results[0].content == "New document after delete all" - def test_delete_all_documents_no_index_recreation(self, document_store: ElasticsearchDocumentStore): - docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - document_store.delete_all_documents(recreate_index=False, refresh=True) - assert document_store.count_documents() == 0 - - new_doc = Document(id="3", content="New document after delete all") - document_store.write_documents([new_doc]) - assert document_store.count_documents() == 1 - - results = document_store.filter_documents() - assert len(results) == 1 - assert results[0].content == "New document after delete all" - - def test_delete_by_filter(self, document_store: ElasticsearchDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Delete documents with category="A" - deleted_count = document_store.delete_by_filter( - filters={"field": "category", "operator": "==", "value": "A"}, refresh=True - ) - assert deleted_count == 2 - assert document_store.count_documents() == 1 - - # Verify only category B remains - remaining_docs = document_store.filter_documents() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - def test_update_by_filter(self, document_store: ElasticsearchDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - 
] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Update status for category="A" documents - updated_count = document_store.update_by_filter( - filters={"field": "category", "operator": "==", "value": "A"}, - meta={"status": "published"}, - refresh=True, - ) - assert updated_count == 2 - - # Verify the updates - published_docs = document_store.filter_documents( - filters={"field": "status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["category"] == "A" - assert doc.meta["status"] == "published" - - # Verify category B still has draft status - draft_docs = document_store.filter_documents(filters={"field": "status", "operator": "==", "value": "draft"}) - assert len(draft_docs) == 1 - assert draft_docs[0].meta["category"] == "B" - def test_count_documents_by_filter(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active"}), diff --git a/integrations/mongodb_atlas/pyproject.toml b/integrations/mongodb_atlas/pyproject.toml index 77c0dcad5b..93172463de 100644 --- a/integrations/mongodb_atlas/pyproject.toml +++ b/integrations/mongodb_atlas/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.22.0", + "haystack-ai>=2.24.0", "pymongo[srv]>=4.13.0" ] diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index df7d9c23c6..ff563e8686 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -10,7 +10,7 @@ from haystack.dataclasses.document import ByteStream, Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import DocumentStoreBaseExtendedTests from haystack.utils import Secret from pymongo import MongoClient from pymongo.driver_info import DriverInfo @@ -116,7 +116,7 @@ def test_document_conversion_methods_with_custom_field_names(self, _mock_client) reason="No MongoDB Atlas connection string provided", ) @pytest.mark.integration -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(DocumentStoreBaseExtendedTests): @pytest.fixture def document_store(self): database_name = "haystack_integration_test" @@ -431,70 +431,6 @@ def test_custom_content_field(self): finally: database[collection_name].drop() - def test_delete_by_filter(self, document_store: MongoDBAtlasDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Delete documents with category="A" - deleted_count = document_store.delete_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert deleted_count == 2 - assert document_store.count_documents() == 1 - - # Verify the remaining document is the one with category="B" - remaining_docs = document_store.filter_documents() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - def test_update_by_filter(self, document_store: MongoDBAtlasDocumentStore): - docs = [ - 
Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Update documents with category="A" to have status="published" - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 - - # Verify the updated documents have the new metadata - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["category"] == "A" - - # Verify documents with category="B" were not updated - unpublished_docs = document_store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "B"} - ) - assert len(unpublished_docs) == 1 - assert "status" not in unpublished_docs[0].meta - - def test_delete_all_documents(self, document_store: MongoDBAtlasDocumentStore): - docs = [Document(id="1", content="first doc"), Document(id="2", content="second doc")] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - - def test_delete_all_documents_empty_collection(self, document_store: MongoDBAtlasDocumentStore): - assert document_store.count_documents() == 0 - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - def test_delete_all_documents_with_recreate_collection(self, document_store: MongoDBAtlasDocumentStore): docs = [Document(id="1", content="first doc"), Document(id="2", content="second doc")] document_store.write_documents(docs) diff --git a/integrations/opensearch/pyproject.toml b/integrations/opensearch/pyproject.toml index d6dd9e4487..3cf9940115 100644 --- a/integrations/opensearch/pyproject.toml +++ b/integrations/opensearch/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ ] dependencies = [ - "haystack-ai>=2.22.0", + "haystack-ai>=2.24.0", "opensearch-py[async]>=3.0.0" ] diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index a5b4f2d09d..439d2335ab 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -737,7 +737,8 @@ def delete_by_filter(self, filters: dict[str, Any], refresh: bool = False) -> in :param filters: The filters to apply to select documents for deletion. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) :param refresh: If True, OpenSearch refreshes all shards involved in the delete by query after the request - completes. If False, no refresh is performed. For more details, see the + completes so that subsequent reads (e.g. count_documents) see the update. If False, no refresh is + performed (better for bulk deletes). For more details, see the [OpenSearch delete_by_query refresh documentation](https://opensearch.org/docs/latest/api-reference/document-apis/delete-by-query/). :returns: The number of documents deleted. 
""" @@ -747,6 +748,7 @@ def delete_by_filter(self, filters: dict[str, Any], refresh: bool = False) -> in try: normalized_filters = normalize_filters(filters) body = {"query": {"bool": {"filter": normalized_filters}}} + result = self._client.delete_by_query(index=self._index, body=body, refresh=refresh) deleted_count = result.get("deleted", 0) logger.info( @@ -766,8 +768,8 @@ async def delete_by_filter_async(self, filters: dict[str, Any], refresh: bool = :param filters: The filters to apply to select documents for deletion. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) :param refresh: If True, OpenSearch refreshes all shards involved in the delete by query after the request - completes. If False, no refresh is performed. For more details, see the - [OpenSearch delete_by_query refresh documentation](https://opensearch.org/docs/latest/api-reference/document-apis/delete-by-query/). + completes so that subsequent reads see the update. If False, no refresh is performed. For more details, + see the [OpenSearch delete_by_query refresh documentation](https://opensearch.org/docs/latest/api-reference/document-apis/delete-by-query/). :returns: The number of documents deleted. """ await self._ensure_initialized_async() diff --git a/integrations/opensearch/tests/conftest.py b/integrations/opensearch/tests/conftest.py index 2ae8462faa..72b112b5ae 100644 --- a/integrations/opensearch/tests/conftest.py +++ b/integrations/opensearch/tests/conftest.py @@ -19,8 +19,8 @@ def _get_unique_index_name() -> str: @pytest.fixture def document_store(): """ - We use this document store for basic tests and for testing filters. - `return_embedding` is set to True because in filters tests we compare embeddings. + OpenSearch document store instance. + Used by document_store and by TestDocumentStore to override the base test class fixture. """ hosts = ["https://localhost:9200"] index = _get_unique_index_name() diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 145949be55..9107788dce 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -9,7 +9,7 @@ from haystack.dataclasses.document import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest +from haystack.testing.document_store import DocumentStoreBaseExtendedTests from opensearchpy.exceptions import RequestError from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore @@ -151,12 +151,17 @@ def test_routing_in_delete(mock_bulk, document_store): @pytest.mark.integration -class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): +class TestDocumentStore(DocumentStoreBaseExtendedTests): """ - Common test cases will be provided by `DocumentStoreBaseTests` but + Common test cases will be provided by `DocumentStoreBaseExtendedTests` but you can add more to this class. 
""" + @pytest.fixture + def document_store(self, document_store): + """Override base class fixture to provide OpenSearch document store.""" + yield document_store + def assert_documents_are_equal(self, received: list[Document], expected: list[Document]): """ The OpenSearchDocumentStore.filter_documents() method returns a Documents with their score set. @@ -523,76 +528,6 @@ def test_delete_all_documents_index_recreation(self, document_store: OpenSearchD assert len(results) == 1 assert results[0].content == "New document after delete all" - def test_delete_all_documents_no_index_recreation(self, document_store: OpenSearchDocumentStore): - docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - document_store.delete_all_documents(recreate_index=False, refresh=True) - assert document_store.count_documents() == 0 - - new_doc = Document(id="3", content="New document after delete all") - document_store.write_documents([new_doc]) - assert document_store.count_documents() == 1 - - results = document_store.filter_documents() - assert len(results) == 1 - assert results[0].content == "New document after delete all" - - def test_delete_by_filter(self, document_store: OpenSearchDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Delete documents with category="A" - deleted_count = document_store.delete_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, refresh=True - ) - assert deleted_count == 2 - assert document_store.count_documents() == 1 - - # Verify only category B remains - remaining_docs = document_store.filter_documents() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - def test_update_by_filter(self, document_store: OpenSearchDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # Update status for category="A" documents - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, - meta={"status": "published"}, - refresh=True, - ) - assert updated_count == 2 - - # Verify the updates - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["category"] == "A" - assert doc.meta["status"] == "published" - - # Verify category B still has draft status - draft_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "draft"} - ) - assert len(draft_docs) == 1 - assert draft_docs[0].meta["category"] == "B" - def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active"}), diff --git a/integrations/pgvector/pyproject.toml b/integrations/pgvector/pyproject.toml index 89cbcbce4b..3dbd0d5e04 100644 --- a/integrations/pgvector/pyproject.toml +++ 
b/integrations/pgvector/pyproject.toml @@ -22,7 +22,11 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "pgvector>=0.3.0", "psycopg[binary]"] +dependencies = [ + "haystack-ai>=2.24.0", + "pgvector>=0.3.0", + "psycopg[binary]" +] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" diff --git a/integrations/pgvector/tests/test_document_store.py b/integrations/pgvector/tests/test_document_store.py index ab7a94bfc8..3ad679def7 100644 --- a/integrations/pgvector/tests/test_document_store.py +++ b/integrations/pgvector/tests/test_document_store.py @@ -9,14 +9,30 @@ from haystack.dataclasses.document import ByteStream, Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest +from haystack.testing.document_store import ( + CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, + DeleteDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + WriteDocumentsTest, +) from haystack.utils import Secret from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore @pytest.mark.integration -class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): +class TestDocumentStore( + CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, + DeleteDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + WriteDocumentsTest, +): def test_write_documents(self, document_store: PgvectorDocumentStore): docs = [Document(id="1")] assert document_store.write_documents(docs) == 1 @@ -53,13 +69,6 @@ def test_connection_check_and_recreation(self, document_store: PgvectorDocumentS same_connection = document_store._connection assert same_connection is document_store._connection - def test_delete_all_documents(self, document_store: PgvectorDocumentStore) -> None: - document_store.write_documents([Document(id=str(i)) for i in range(10)]) - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - document_store.write_documents([Document(id="1")]) - assert document_store.count_documents() == 1 - def test_invalid_connection_string(self, monkeypatch): monkeypatch.setenv("PG_CONN_STR", "invalid_connection_string") document_store = PgvectorDocumentStore() @@ -262,215 +271,6 @@ def test_delete_table_first_call(document_store): document_store.delete_table() # if throw error, test fails -@pytest.mark.integration -def test_delete_by_filter(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "B", "year": 2023}), - Document(content="Doc 3", meta={"category": "A", "year": 2024}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - deleted_count = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"}) - assert deleted_count == 2 - assert document_store.count_documents() == 1 - - remaining_docs = document_store.filter_documents() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - deleted_count = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023}) - assert deleted_count == 1 - assert 
document_store.count_documents() == 0 - - -@pytest.mark.integration -def test_delete_by_filter_no_matches(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - deleted_count = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "C"}) - assert deleted_count == 0 - assert document_store.count_documents() == 2 - - -@pytest.mark.integration -def test_delete_by_filter_advanced_filters(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "published"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # AND condition - deleted_count = document_store.delete_by_filter( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - } - ) - assert deleted_count == 1 - assert document_store.count_documents() == 2 - - # OR condition - deleted_count = document_store.delete_by_filter( - filters={ - "operator": "OR", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.status", "operator": "==", "value": "published"}, - ], - } - ) - assert deleted_count == 2 - assert document_store.count_documents() == 0 - - -@pytest.mark.integration -def test_update_by_filter(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # update status for category="A" documents - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 - - # verify - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["category"] == "A" - assert doc.meta["status"] == "published" - - # Verify category B still has draft status - draft_docs = document_store.filter_documents(filters={"field": "meta.status", "operator": "==", "value": "draft"}) - assert len(draft_docs) == 1 - assert draft_docs[0].meta["category"] == "B" - - -@pytest.mark.integration -def test_update_by_filter_multiple_fields(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "A", "year": 2023}), - Document(content="Doc 3", meta={"category": "B", "year": 2024}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # update multiple fields for category="A" documents - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, - meta={"status": "published", "priority": "high", "reviewed": True}, - ) - assert updated_count 
== 2 - - # verify - published_docs = document_store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "A"}) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["priority"] == "high" - assert doc.meta["reviewed"] is True - assert doc.meta["year"] == 2023 # Original field should still be present - - # verify category B was not updated - b_docs = document_store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "B"}) - assert len(b_docs) == 1 - assert "status" not in b_docs[0].meta - assert "priority" not in b_docs[0].meta - - -@pytest.mark.integration -def test_update_by_filter_no_matches(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - # update documents with category="C" (no matches) - updated_count = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} - ) - assert updated_count == 0 - assert document_store.count_documents() == 2 - - # verify no documents were updated - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 0 - - -@pytest.mark.integration -def test_update_by_filter_advanced_filters(document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "draft"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - # AND condition - updated_count = document_store.update_by_filter( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - }, - meta={"status": "published"}, - ) - assert updated_count == 1 - - # verify only one document was updated - published_docs = document_store.filter_documents( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 1 - assert published_docs[0].meta["category"] == "A" - assert published_docs[0].meta["year"] == 2023 - - # OR condition - updated_count = document_store.update_by_filter( - filters={ - "operator": "OR", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.year", "operator": "==", "value": 2024}, - ], - }, - meta={"featured": True}, - ) - assert updated_count == 2 - - featured_docs = document_store.filter_documents(filters={"field": "meta.featured", "operator": "==", "value": True}) - assert len(featured_docs) == 2 - - @pytest.mark.integration def test_update_by_filter_empty_meta_raises_error(document_store: PgvectorDocumentStore): docs = [Document(content="Doc 1", meta={"category": "A"})] diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index f079757b8c..56900f1e9e 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.22.0", + 
"haystack-ai>=2.24.0", "pinecone[asyncio]>=7.0.0", ] diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index 726f36f06f..e0d7885cfd 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -56,7 +56,8 @@ def delete_documents_and_wait(filters): yield store try: - store._index.delete(delete_all=True, namespace=namespace) + if store._index is not None: + store._index.delete(delete_all=True, namespace=namespace) except NotFoundException: pass @@ -98,7 +99,8 @@ async def delete_documents_and_wait_async(filters): yield store try: - await store._async_index.delete(delete_all=True, namespace=namespace) - await store.close_async() + if store._async_index is not None: + await store._async_index.delete(delete_all=True, namespace=namespace) + await store.close_async() except NotFoundException: pass diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 3f949efac3..864b6127a1 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -11,7 +11,15 @@ from haystack import Document from haystack.components.preprocessors import DocumentSplitter from haystack.components.retrievers import SentenceWindowRetriever -from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest +from haystack.testing.document_store import ( + CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, + DeleteDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + WriteDocumentsTest, +) from haystack.utils import Secret from pinecone import Pinecone, PodSpec, ServerlessSpec @@ -260,7 +268,15 @@ def test_serverless_index_creation_from_scratch(delete_sleep_time): @pytest.mark.integration @pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set") -class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest): +class TestDocumentStore( + CountDocumentsTest, + DeleteDocumentsTest, + WriteDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + DeleteAllTest, + DeleteByFilterTest, +): def test_write_documents(self, document_store: PineconeDocumentStore): docs = [Document(id="1")] assert document_store.write_documents(docs) == 1 @@ -279,19 +295,6 @@ def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentSt @pytest.mark.skip(reason="Pinecone creates a namespace only when the first document is written") def test_delete_documents_empty_document_store(self, document_store: PineconeDocumentStore): ... 
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
index 3f949efac3..864b6127a1 100644
--- a/integrations/pinecone/tests/test_document_store.py
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -11,7 +11,15 @@
 from haystack import Document
 from haystack.components.preprocessors import DocumentSplitter
 from haystack.components.retrievers import SentenceWindowRetriever
-from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
+from haystack.testing.document_store import (
+    CountDocumentsTest,
+    DeleteAllTest,
+    DeleteByFilterTest,
+    DeleteDocumentsTest,
+    FilterableDocsFixtureMixin,
+    UpdateByFilterTest,
+    WriteDocumentsTest,
+)
 from haystack.utils import Secret
 from pinecone import Pinecone, PodSpec, ServerlessSpec

@@ -260,7 +268,15 @@ def test_serverless_index_creation_from_scratch(delete_sleep_time):

 @pytest.mark.integration
 @pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set")
-class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
+class TestDocumentStore(
+    CountDocumentsTest,
+    DeleteDocumentsTest,
+    WriteDocumentsTest,
+    FilterableDocsFixtureMixin,
+    UpdateByFilterTest,
+    DeleteAllTest,
+    DeleteByFilterTest,
+):
     def test_write_documents(self, document_store: PineconeDocumentStore):
         docs = [Document(id="1")]
         assert document_store.write_documents(docs) == 1
@@ -279,19 +295,6 @@ def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentSt
     @pytest.mark.skip(reason="Pinecone creates a namespace only when the first document is written")
     def test_delete_documents_empty_document_store(self, document_store: PineconeDocumentStore): ...

-    def test_delete_all_documents(self, document_store: PineconeDocumentStore):
-        docs = [Document(content="first doc"), Document(content="second doc")]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
-    def test_delete_all_documents_empty_collection(self, document_store: PineconeDocumentStore):
-        assert document_store.count_documents() == 0
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
     def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
         query_embedding = [0.1] * 768
         most_similar_embedding = [0.8] * 768
@@ -350,103 +353,6 @@ def test_sentence_window_retriever(self, document_store: PineconeDocumentStore):
         assert len(result["context_windows"]) == 1

-    def test_delete_by_filter(self, document_store: PineconeDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-            Document(content="Doc 3", meta={"category": "A"}),
-        ]
-        document_store.write_documents(docs)
-
-        # delete documents with category="A"
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}
-        )
-        assert deleted_count == 2
-        assert document_store.count_documents() == 1
-
-        # verify only category B remains
-        remaining_docs = document_store.filter_documents()
-        assert len(remaining_docs) == 1
-        assert remaining_docs[0].meta["category"] == "B"
-
-    def test_delete_by_filter_no_matches(self, document_store: PineconeDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-        ]
-        document_store.write_documents(docs)
-
-        # try to delete documents with category="C" (no matches)
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}
-        )
-        assert deleted_count == 0
-        assert document_store.count_documents() == 2
-
-    def test_update_by_filter(self, document_store: PineconeDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "B", "status": "draft"}),
-            Document(content="Doc 3", meta={"category": "A", "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-
-        # update status for category="A" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"}
-        )
-        assert updated_count == 2
-
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["category"] == "A"
-            assert doc.meta["status"] == "published"
-
-    def test_update_by_filter_multiple_fields(self, document_store: PineconeDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "status": "draft", "priority": "low"}),
-            Document(content="Doc 2", meta={"category": "B", "status": "draft", "priority": "low"}),
-            Document(content="Doc 3", meta={"category": "A", "status": "draft", "priority": "low"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Update multiple fields for category="A" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"},
-            meta={"status": "published", "priority": "high"},
-        )
-        assert updated_count == 2
-
-        # Verify the updates
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["category"] == "A"
-            assert doc.meta["status"] == "published"
-            assert doc.meta["priority"] == "high"
-
-    def test_update_by_filter_no_matches(self, document_store: PineconeDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        # Try to update documents with category="C" (no matches)
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"}
-        )
-        assert updated_count == 0
-        assert document_store.count_documents() == 2
-
     def test_count_documents_by_filter(self, document_store: PineconeDocumentStore):
         docs = [
             Document(content="Doc 1", meta={"category": "A", "status": "draft"}),
diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml
index f4284ded16..acbff15a4c 100644
--- a/integrations/qdrant/pyproject.toml
+++ b/integrations/qdrant/pyproject.toml
@@ -25,7 +25,10 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.22.0", "qdrant-client>=1.12.0"]
+dependencies = [
+    "haystack-ai>=2.24.0",
+    "qdrant-client>=1.12.0"
+]

 [project.urls]
 Source = "https://github.com/deepset-ai/haystack-core-integrations"
diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py
index 15c9525e2f..74cc0c8067 100644
--- a/integrations/qdrant/tests/test_document_store.py
+++ b/integrations/qdrant/tests/test_document_store.py
@@ -7,7 +7,11 @@
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import (
     CountDocumentsTest,
+    DeleteAllTest,
+    DeleteByFilterTest,
     DeleteDocumentsTest,
+    FilterableDocsFixtureMixin,
+    UpdateByFilterTest,
     WriteDocumentsTest,
     _random_embeddings,
 )
@@ -22,7 +26,15 @@
 )


-class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
+class TestQdrantDocumentStore(
+    CountDocumentsTest,
+    DeleteAllTest,
+    DeleteByFilterTest,
+    DeleteDocumentsTest,
+    FilterableDocsFixtureMixin,
+    UpdateByFilterTest,
+    WriteDocumentsTest,
+):
     @pytest.fixture
     def document_store(self) -> QdrantDocumentStore:
         return QdrantDocumentStore(
@@ -302,21 +314,6 @@ def test_set_up_collection_with_dimension_mismatch(self):
         with pytest.raises(ValueError, match="different vector size"):
             document_store._set_up_collection("test_collection", 768, False, "cosine", False, False)

-    def test_delete_all_documents_no_index_recreation(self, document_store):
-        document_store._initialize_client()
-
-        # write some documents
-        docs = [Document(id=str(i)) for i in range(5)]
-        document_store.write_documents(docs)
-
-        # delete all documents without recreating the index
-        document_store.delete_all_documents(recreate_index=False)
-        assert document_store.count_documents() == 0
-
-        # ensure the collection still exists by writing documents again
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 5
-
     def test_delete_all_documents_index_recreation(self, document_store):
         document_store._initialize_client()
@@ -340,183 +337,6 @@ def test_delete_all_documents_index_recreation(self, document_store):
         document_store.write_documents(docs)
         assert document_store.count_documents() == 5

-    def test_delete_by_filter(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023}),
-            Document(content="Doc 2", meta={"category": "B", "year": 2023}),
-            Document(content="Doc 3", meta={"category": "A", "year": 2024}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}
-        )
-        assert deleted_count == 2
-
-        # Verify only category B remains
-        remaining_docs = document_store.filter_documents()
-        assert len(remaining_docs) == 1
-        assert remaining_docs[0].meta["category"] == "B"
-
-        # Delete remaining document by year
-        deleted_count = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023})
-        assert deleted_count == 1
-        assert document_store.count_documents() == 0
-
-    def test_delete_by_filter_no_matches(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        # try to delete documents with category="C" (no matches)
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}
-        )
-        assert deleted_count == 0
-        assert document_store.count_documents() == 2
-
-    def test_delete_by_filter_advanced_filters(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "published"}),
-            Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # AND condition (matches only Doc 1)
-        deleted_count = document_store.delete_by_filter(
-            filters={
-                "operator": "AND",
-                "conditions": [
-                    {"field": "meta.category", "operator": "==", "value": "A"},
-                    {"field": "meta.year", "operator": "==", "value": 2023},
-                ],
-            }
-        )
-        assert deleted_count == 1
-        assert document_store.count_documents() == 2
-
-        # OR condition (matches Doc 2 and Doc 3)
-        deleted_count = document_store.delete_by_filter(
-            filters={
-                "operator": "OR",
-                "conditions": [
-                    {"field": "meta.category", "operator": "==", "value": "B"},
-                    {"field": "meta.status", "operator": "==", "value": "published"},
-                ],
-            }
-        )
-        assert deleted_count == 2
-        assert document_store.count_documents() == 0
-
-    def test_update_by_filter(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "B", "status": "draft"}),
-            Document(content="Doc 3", meta={"category": "A", "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Update status for category="A" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"}
-        )
-        assert updated_count == 2
-
-        # Verify the updated documents have the new metadata
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["status"] == "published"
-            assert doc.meta["category"] == "A"
-
-        # Verify documents with category="B" were not updated
-        draft_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "draft"}
-        )
-        assert len(draft_docs) == 1
-        assert draft_docs[0].meta["category"] == "B"
-
-    def test_update_by_filter_multiple_fields(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023}),
-            Document(content="Doc 2", meta={"category": "A", "year": 2023}),
-            Document(content="Doc 3", meta={"category": "B", "year": 2024}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Update multiple fields for category="A" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"},
-            meta={"status": "published", "reviewed": True},
-        )
-        assert updated_count == 2
-
-        # Verify updates
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["status"] == "published"
-            assert doc.meta["reviewed"] is True
-            assert doc.meta["category"] == "A"
-            assert doc.meta["year"] == 2023  # Existing field preserved
-
-    def test_update_by_filter_no_matches(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A"}),
-            Document(content="Doc 2", meta={"category": "B"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        # Try to update documents with category="C" (no matches)
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"}
-        )
-        assert updated_count == 0
-        assert document_store.count_documents() == 2
-
-    def test_update_by_filter_advanced_filters(self, document_store: QdrantDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "draft"}),
-            Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Update with AND condition
-        updated_count = document_store.update_by_filter(
-            filters={
-                "operator": "AND",
-                "conditions": [
-                    {"field": "meta.category", "operator": "==", "value": "A"},
-                    {"field": "meta.year", "operator": "==", "value": 2023},
-                ],
-            },
-            meta={"status": "published"},
-        )
-        assert updated_count == 1
-
-        # Verify only one document was updated
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 1
-        assert published_docs[0].meta["category"] == "A"
-        assert published_docs[0].meta["year"] == 2023
-
     def test_update_by_filter_preserves_vectors(self, document_store: QdrantDocumentStore):
         """Test that update_by_filter preserves document embeddings."""
         docs = [
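All the removed suites exercised the same Haystack 2.x filter grammar: a leaf is a `{"field", "operator", "value"}` dict, and branches combine leaves under `{"operator": "AND"/"OR", "conditions": [...]}`. The stores translate these dicts into native Qdrant or Weaviate queries; the toy evaluator below only illustrates the grammar's semantics (the `matches` helper is hypothetical and supports just `==`):

```python
from haystack import Document


def matches(doc: Document, filters: dict) -> bool:
    """Toy evaluator for Haystack 2.x filter dicts; handles only '==' and AND/OR."""
    if "conditions" in filters:
        hits = [matches(doc, cond) for cond in filters["conditions"]]
        return all(hits) if filters["operator"] == "AND" else any(hits)
    field = filters["field"].removeprefix("meta.")
    return doc.meta.get(field) == filters["value"]


docs = [
    Document(content="Doc 1", meta={"category": "A", "year": 2023}),
    Document(content="Doc 2", meta={"category": "B", "year": 2023}),
]
compound = {
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "A"},
        {"field": "meta.year", "operator": "==", "value": 2023},
    ],
}
# Only Doc 1 satisfies both leaf conditions of the AND branch.
assert [d.content for d in docs if matches(d, compound)] == ["Doc 1"]
```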
diff --git a/integrations/weaviate/pyproject.toml b/integrations/weaviate/pyproject.toml
index d52d4db397..f27c8435d4 100644
--- a/integrations/weaviate/pyproject.toml
+++ b/integrations/weaviate/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "haystack-ai>=2.22.0",
+    "haystack-ai>=2.24.0",
     "weaviate-client>=4.9",
     "python-dateutil",
 ]
diff --git a/integrations/weaviate/tests/test_document_store.py b/integrations/weaviate/tests/test_document_store.py
index 47623cb89b..32747c82d2 100644
--- a/integrations/weaviate/tests/test_document_store.py
+++ b/integrations/weaviate/tests/test_document_store.py
@@ -13,10 +13,7 @@
 from haystack.dataclasses.document import Document
 from haystack.document_stores.errors import DocumentStoreError
 from haystack.testing.document_store import (
-    CountDocumentsTest,
-    DeleteDocumentsTest,
-    FilterDocumentsTest,
-    WriteDocumentsTest,
+    DocumentStoreBaseExtendedTests,
     create_filterable_docs,
 )
 from haystack.utils.auth import Secret
@@ -47,7 +44,7 @@ def test_init_is_lazy(_mock_client):

 @pytest.mark.integration
-class TestWeaviateDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest):
+class TestWeaviateDocumentStore(DocumentStoreBaseExtendedTests):
     @pytest.fixture
     def document_store(self, request) -> WeaviateDocumentStore:
         # Use a different index for each test so we can run them in parallel
@@ -797,13 +794,6 @@ def test_connect_to_embedded(self):
         document_store = WeaviateDocumentStore(embedded_options=EmbeddedOptions())
         assert document_store.client

-    def test_delete_all_documents(self, document_store):
-        docs = [Document(content="test doc 1"), Document(content="test doc 2")]
-        assert document_store.write_documents(docs) == 2
-        assert document_store.count_documents() == 2
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
     def test_delete_all_documents_recreate(self, document_store):
         docs = [Document(content="test doc 1"), Document(content="test doc 2")]
         assert document_store.write_documents(docs) == 2
@@ -835,46 +825,6 @@ def test_delete_all_documents_excessive_batch_size(self, document_store, caplog)
         assert document_store.count_documents() == 5
         assert "Not all documents have been deleted." in caplog.text

-    def test_delete_by_filter(self, document_store):
-        docs = [
-            Document(content="Doc 1", meta={"category": "TypeA"}),
-            Document(content="Doc 2", meta={"category": "TypeB"}),
-            Document(content="Doc 3", meta={"category": "TypeA"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Delete documents with category="TypeA"
-        deleted_count = document_store.delete_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "TypeA"}
-        )
-        assert deleted_count == 2
-        assert document_store.count_documents() == 1
-
-    def test_update_by_filter(self, document_store):
-        docs = [
-            Document(content="Doc 1", meta={"category": "TypeA", "status": "draft"}),
-            Document(content="Doc 2", meta={"category": "TypeB", "status": "draft"}),
-            Document(content="Doc 3", meta={"category": "TypeA", "status": "draft"}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        # Update status for category="TypeA" documents
-        updated_count = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "TypeA"}, meta={"status": "published"}
-        )
-        assert updated_count == 2
-
-        # Verify the updates
-        published_docs = document_store.filter_documents(
-            filters={"field": "meta.status", "operator": "==", "value": "published"}
-        )
-        assert len(published_docs) == 2
-        for doc in published_docs:
-            assert doc.meta["category"] == "TypeA"
-            assert doc.meta["status"] == "published"
-
     def test_update_by_filter_with_pagination(self, document_store, monkeypatch):
         # Reduce DEFAULT_QUERY_LIMIT to test pagination without creating 10000+ documents
         monkeypatch.setattr("haystack_integrations.document_stores.weaviate.document_store.DEFAULT_QUERY_LIMIT", 100)
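The surviving pagination test works by shrinking the Weaviate store's module-level `DEFAULT_QUERY_LIMIT` with `monkeypatch.setattr`, so the paginated query path triggers with a handful of documents instead of 10000+. A self-contained sketch of that technique; `DEFAULT_QUERY_LIMIT` and `fetch_paginated` below are stand-ins for this sketch, not the integration's actual implementation:

```python
import sys

# Hypothetical module-level constant and paginated query loop; only the
# monkeypatching technique itself mirrors the Weaviate test above.
DEFAULT_QUERY_LIMIT = 10_000


def fetch_paginated(items):
    """Split `items` into pages of at most DEFAULT_QUERY_LIMIT entries."""
    limit = DEFAULT_QUERY_LIMIT
    return [items[i : i + limit] for i in range(0, len(items), limit)]


def test_pagination_with_small_limit(monkeypatch):
    # Shrink the module-level limit so pagination triggers with 8 items
    # rather than thousands of documents.
    monkeypatch.setattr(sys.modules[__name__], "DEFAULT_QUERY_LIMIT", 3)
    pages = fetch_paginated(list(range(8)))
    assert [len(page) for page in pages] == [3, 3, 2]
```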