diff --git a/graphai/api/retrieval/router.py b/graphai/api/retrieval/router.py index 2a22f579..a1ee5a5f 100644 --- a/graphai/api/retrieval/router.py +++ b/graphai/api/retrieval/router.py @@ -44,6 +44,12 @@ async def retrieve_from_es_index(data: RetrievalRequest, index_to_search_in = data.index return_scores = data.return_scores filter_by_date = data.filter_by_date + # Check for None (default) value in flag and mutate based on selected index + if filter_by_date is None: + if index_to_search_in.startswith('course_'): + filter_by_date = True + else: + filter_by_date = False if not has_rag_access_rights(current_user.username, index_to_search_in): return INSUFFICIENT_ACCESS_ERROR results = retrieve_from_es_job(text, index_to_search_in, filters, limit, return_scores, filter_by_date) diff --git a/graphai/api/retrieval/schemas.py b/graphai/api/retrieval/schemas.py index 283e04fe..ab93d2d8 100644 --- a/graphai/api/retrieval/schemas.py +++ b/graphai/api/retrieval/schemas.py @@ -31,14 +31,14 @@ class RetrievalRequest(BaseModel): default=False ) - filter_by_date: bool = Field( + filter_by_date: Union[bool, None] = Field( title="Filter by current date", description="If True, if the requested index has 'from' and 'until' fields, only returns documents " "that are available at the current date and time based on those two fields. Basically " "a smart custom filter that doesn't require the user to manually provide the current " "datetime and ask for 'from' to be before it and for 'until' to be after it. " "If the index does not have 'from' and 'until' fields, this results in an empty response.", - default=False + default=None ) diff --git a/graphai/core/common/fingerprinting.py b/graphai/core/common/fingerprinting.py index 76173958..8bc503dc 100644 --- a/graphai/core/common/fingerprinting.py +++ b/graphai/core/common/fingerprinting.py @@ -7,12 +7,15 @@ import imagehash import numpy as np from PIL import Image -import pdf2image +import pymupdf from fuzzywuzzy import fuzz from graphai.core.common.common_utils import file_exists, is_pdf +Image.MAX_IMAGE_PIXELS = 933120000 + + def perceptual_hash_text(s): """ Computes the perceptual hash of a strong @@ -137,9 +140,10 @@ def perceptual_hash_pdf(input_filename_with_path, hash_size=16): if not file_exists(input_filename_with_path) or not is_pdf(input_filename_with_path): print(f'File {input_filename_with_path} does not exist or is not in the right format') return None - pdf_imageset = pdf2image.convert_from_path(input_filename_with_path) + pdf_imageset = pymupdf.open(input_filename_with_path) results = hashlib.md5( - ''.join(str(imagehash.dhash(x, hash_size=hash_size)) for x in pdf_imageset).encode('utf8') + ''.join(str(imagehash.dhash(x.get_pixmap().pil_image(), + hash_size=hash_size)) for x in pdf_imageset).encode('utf8') ).hexdigest() return str(results)