From 0287b68de0cccc4da06fca48aaeafeea8151d891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20G=C3=B6pfert?= <94385965+jangoepfert@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:47:52 +0100 Subject: [PATCH] Added option to directly pass the PDF as bytes to process_pdf instead of loading the PDF from disk --- grobid_client/grobid_client.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index b63cf20..61fe167 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -243,9 +243,18 @@ def process_pdf( include_raw_citations, include_raw_affiliations, tei_coordinates, - segment_sentences + segment_sentences, + from_memory=False ): - pdf_handle = open(pdf_file, "rb") + if from_memory: + # PDF already loaded into memory + # expects pdf_file to be of type 'bytes' + pdf_handle = io.BytesIO(pdf_file) + pdf_file = "" + else: + # expects pdf_file to be path to PDF file + pdf_handle = open(pdf_file, "rb") + files = { "input": ( pdf_file,