-
Notifications
You must be signed in to change notification settings - Fork 61
Description
Hi Team,
I am using layoutparser for detecting tables and images.
When I just try to run code on individual png image file, model detects tables and figures correctly.
However, when I am using below code to convert pdf into images and detecting tables out of each page image, I am either not getting full image/table or sometimes get duplicates tables as well.
Can you please guide on how to refine below code and what I can try to resolve this issue? Thank you!
!pip install layoutparser
!pip install opencv-python numpy matplotlib
install detectron2:
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils
import os
from pdf2image import convert_from_path
import shutil
import cv2
import layoutparser as lp
PubLayNet
model = lp.models.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})
def save_detections(table_blocks, image, image_name, save_dir='/content/'):
for j in range(len(table_blocks)):
x_1, y_1, x_2, y_2 = table_blocks[0].block.x_1, table_blocks[0].block.y_1, table_blocks[0].block.x_2, table_blocks[0].block.y_2
cropped = image[int(y_1):int(y_2), int(x_1):int(x_2)]
cv2_imshow(cropped)
file_name = image_name+'_'+str(j)+'.jpg'
status = cv2.imwrite(save_dir+file_name, cropped)
if status:
print("Saved ", file_name)
def inference(images_dir):
table_blocks_list = []
# Getting images from the directory
for file in os.listdir(images_dir):
if file.endswith(".jpg"):
# Extract the image name (excluding the extension)
image_name = file[:-4]
# # Reading the image using OpenCV
image = cv2.imread(images_dir+'/'+file)
# OpenCV reads images in BGR format, convert to RGB
image = image[..., ::-1]
# Running Inference
layout = model.detect(image)
# Extracting Tables
table_blocks = lp.Layout([b for b in layout if b.type=="Table"])
figure_blocks = lp.Layout([b for b in layout if b.type=='Figure'])
table_blocks = lp.Layout([b for b in table_blocks \
if not any(b.is_in(b_fig) for b_fig in figure_blocks)])
h, w = image.shape[:2]
left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)
left_blocks = table_blocks.filter_by(left_interval, center=True)
left_blocks.sort(key = lambda b:b.coordinates[1])
right_blocks = [b for b in table_blocks if b not in left_blocks]
right_blocks.sort(key = lambda b:b.coordinates[1])
# And finally combine the two list and add the index
# according to the order
table_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)])
save_detections(table_blocks, image, image_name)
table_blocks_list.append(table_blocks)
return table_blocks_list
def pdf_inference(pdfName):
# Converting each page to an image
# Get the current working directory
path = os.getcwd()
# Construct the full path to the PDF file
PDF_file = path + "/" + pdfName
# Create a directory to store converted images
if os.path.exists(path+'/pdf_images'):
shutil.rmtree(path+'/pdf_images')
os.mkdir(path+'/pdf_images')
# Convert each page of the PDF to an image
pages = convert_from_path(PDF_file, dpi=100, grayscale=True)
image_counter = 1
# Iterate over the pages
for page in pages:
filename = "page_"+str(image_counter)+".jpg"
# st.write(filename)
filepath = path+"/pdf_images/" + filename
# Save the page as a JPEG image in the 'pdf_images' directory
page.save(f'{filepath}', 'JPEG')
image_counter = image_counter + 1
#filelimit = image_counter-1
# Running inference on the images
table_blocks_list = inference(path+'/pdf_images')
#return table_blocks_list
test = pdf_inference('abc-Datasheet.pdf')
Thanks
Reema Jain