From 6a13741e281ff5c6be8dab3eacc589c6c9205457 Mon Sep 17 00:00:00 2001 From: Ramtin Yazdanian Date: Thu, 28 Aug 2025 18:32:28 +0200 Subject: [PATCH 1/3] use pymupdf for fp and raise PIL image size limit --- graphai/core/common/fingerprinting.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/graphai/core/common/fingerprinting.py b/graphai/core/common/fingerprinting.py index 76173958..8bc503dc 100644 --- a/graphai/core/common/fingerprinting.py +++ b/graphai/core/common/fingerprinting.py @@ -7,12 +7,15 @@ import imagehash import numpy as np from PIL import Image -import pdf2image +import pymupdf from fuzzywuzzy import fuzz from graphai.core.common.common_utils import file_exists, is_pdf +Image.MAX_IMAGE_PIXELS = 933120000 + + def perceptual_hash_text(s): """ Computes the perceptual hash of a strong @@ -137,9 +140,10 @@ def perceptual_hash_pdf(input_filename_with_path, hash_size=16): if not file_exists(input_filename_with_path) or not is_pdf(input_filename_with_path): print(f'File {input_filename_with_path} does not exist or is not in the right format') return None - pdf_imageset = pdf2image.convert_from_path(input_filename_with_path) + pdf_imageset = pymupdf.open(input_filename_with_path) results = hashlib.md5( - ''.join(str(imagehash.dhash(x, hash_size=hash_size)) for x in pdf_imageset).encode('utf8') + ''.join(str(imagehash.dhash(x.get_pixmap().pil_image(), + hash_size=hash_size)) for x in pdf_imageset).encode('utf8') ).hexdigest() return str(results) From 75ce8ded9803549f38f653bcd8f0b0f62be8f651 Mon Sep 17 00:00:00 2001 From: Ramtin Yazdanian Date: Mon, 1 Sep 2025 11:27:52 +0200 Subject: [PATCH 2/3] make default value of filter_by_date dependent on index name --- graphai/api/retrieval/schemas.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/graphai/api/retrieval/schemas.py b/graphai/api/retrieval/schemas.py index 283e04fe..0811bd09 100644 --- a/graphai/api/retrieval/schemas.py +++ b/graphai/api/retrieval/schemas.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ValidationInfo, field_validator from typing import Union, List, Dict, Literal, Any @@ -31,16 +31,26 @@ class RetrievalRequest(BaseModel): default=False ) - filter_by_date: bool = Field( + filter_by_date: Union[bool, None] = Field( title="Filter by current date", description="If True, if the requested index has 'from' and 'until' fields, only returns documents " "that are available at the current date and time based on those two fields. Basically " "a smart custom filter that doesn't require the user to manually provide the current " "datetime and ask for 'from' to be before it and for 'until' to be after it. " "If the index does not have 'from' and 'until' fields, this results in an empty response.", - default=False + default=None ) + @field_validator("filter_by_date", mode='after') + @classmethod + def set_default_filter_by_date_value(cls, value: Union[bool, None], info: ValidationInfo) -> bool: + if value is None: + if info.data['index'].startswith('course_'): + return True + return False + return value + + class RetrievalResponse(BaseModel): n_results: int = Field( From 9bd1e1541b6552c884bd068e40638125805b346a Mon Sep 17 00:00:00 2001 From: Ramtin Yazdanian Date: Mon, 1 Sep 2025 15:01:35 +0200 Subject: [PATCH 3/3] do the whole thing through the endpoint handler --- graphai/api/retrieval/router.py | 6 ++++++ graphai/api/retrieval/schemas.py | 12 +----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/graphai/api/retrieval/router.py b/graphai/api/retrieval/router.py index 2a22f579..a1ee5a5f 100644 --- a/graphai/api/retrieval/router.py +++ b/graphai/api/retrieval/router.py @@ -44,6 +44,12 @@ async def retrieve_from_es_index(data: RetrievalRequest, index_to_search_in = data.index return_scores = data.return_scores filter_by_date = data.filter_by_date + # Check for None (default) value in flag and mutate based on selected index + if filter_by_date is None: + if index_to_search_in.startswith('course_'): + filter_by_date = True + else: + filter_by_date = False if not has_rag_access_rights(current_user.username, index_to_search_in): return INSUFFICIENT_ACCESS_ERROR results = retrieve_from_es_job(text, index_to_search_in, filters, limit, return_scores, filter_by_date) diff --git a/graphai/api/retrieval/schemas.py b/graphai/api/retrieval/schemas.py index 0811bd09..ab93d2d8 100644 --- a/graphai/api/retrieval/schemas.py +++ b/graphai/api/retrieval/schemas.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, Field, ValidationInfo, field_validator +from pydantic import BaseModel, Field from typing import Union, List, Dict, Literal, Any @@ -41,16 +41,6 @@ class RetrievalRequest(BaseModel): default=None ) - @field_validator("filter_by_date", mode='after') - @classmethod - def set_default_filter_by_date_value(cls, value: Union[bool, None], info: ValidationInfo) -> bool: - if value is None: - if info.data['index'].startswith('course_'): - return True - return False - return value - - class RetrievalResponse(BaseModel): n_results: int = Field(