Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions graphai/api/retrieval/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ async def retrieve_from_es_index(data: RetrievalRequest,
index_to_search_in = data.index
return_scores = data.return_scores
filter_by_date = data.filter_by_date
# Check for None (default) value in flag and mutate based on selected index
if filter_by_date is None:
if index_to_search_in.startswith('course_'):
filter_by_date = True
else:
filter_by_date = False
if not has_rag_access_rights(current_user.username, index_to_search_in):
return INSUFFICIENT_ACCESS_ERROR
results = retrieve_from_es_job(text, index_to_search_in, filters, limit, return_scores, filter_by_date)
Expand Down
4 changes: 2 additions & 2 deletions graphai/api/retrieval/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ class RetrievalRequest(BaseModel):
default=False
)

filter_by_date: bool = Field(
filter_by_date: Union[bool, None] = Field(
title="Filter by current date",
description="If True, if the requested index has 'from' and 'until' fields, only returns documents "
"that are available at the current date and time based on those two fields. Basically "
"a smart custom filter that doesn't require the user to manually provide the current "
"datetime and ask for 'from' to be before it and for 'until' to be after it. "
"If the index does not have 'from' and 'until' fields, this results in an empty response.",
default=False
default=None
)


Expand Down
10 changes: 7 additions & 3 deletions graphai/core/common/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@
import imagehash
import numpy as np
from PIL import Image
import pdf2image
import pymupdf
from fuzzywuzzy import fuzz

from graphai.core.common.common_utils import file_exists, is_pdf


Image.MAX_IMAGE_PIXELS = 933120000


def perceptual_hash_text(s):
"""
Computes the perceptual hash of a string
Expand Down Expand Up @@ -137,9 +140,10 @@ def perceptual_hash_pdf(input_filename_with_path, hash_size=16):
if not file_exists(input_filename_with_path) or not is_pdf(input_filename_with_path):
print(f'File {input_filename_with_path} does not exist or is not in the right format')
return None
pdf_imageset = pdf2image.convert_from_path(input_filename_with_path)
pdf_imageset = pymupdf.open(input_filename_with_path)
results = hashlib.md5(
''.join(str(imagehash.dhash(x, hash_size=hash_size)) for x in pdf_imageset).encode('utf8')
''.join(str(imagehash.dhash(x.get_pixmap().pil_image(),
hash_size=hash_size)) for x in pdf_imageset).encode('utf8')
).hexdigest()
return str(results)

Expand Down