Load Document
Detect Table Components
Use layout detection to identify table regions in the document.
Visualize Results (Optional)
See how the layout detection performed on your document.
Extract table data from documents and format as CSV for further processing.
from parseport.document.pdf_document import PDFDocument
from PIL import Image
import asyncio
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
from parseport.struct import ComponentType
# Load the PDF; include_visuals=True is passed through to PDFDocument.from_path.
doc = PDFDocument.from_path('tables.pdf', include_visuals=True)

# Run layout detection on CPU and collect the detected components.
detector = PaddleLayoutDetector(device='cpu')
parser = SimpleLayoutParser(detector)
components = parser.parse_document(doc)

# Optional visual check: draw the detected component blocks on one page.
# Index 4 is page 5 (the original comment indicates a 0-based page index).
# NOTE: the original called this on `document`, which is never defined;
# the loaded document is bound to `doc`.
image = doc.render_image_with_component_blocks(4, components)
# `display` is not imported here - presumably the notebook (IPython) builtin.
display(Image.fromarray(image))

vlm_generator = OpenAIVLMGenerator()
reader = VLMDocumentReader(vlm_generator)
components = await reader.extract_texts(doc, components, show_progress=True)
tables = [component for component in components if component.type == ComponentType.TABLE]
print(f'Found {len(tables)} tables')
Found 29 tables
# Formatter built on the same VLM generator used for text extraction.
formatter = VLMFormatter(vlm_generator)
# Shared semaphore: at most 10 format_table calls can hold it at once.
semaphore = asyncio.Semaphore(10)


async def format_table(table_component):
    """Format a single table component as CSV text.

    Waits for a slot on the module-level ``semaphore``, then delegates to
    ``formatter.format_component`` with the loaded ``doc``, the given
    component and a fixed CSV-extraction prompt.

    Args:
        table_component: The component to format (callers pass entries
            of ``tables``).

    Returns:
        Whatever ``formatter.format_component`` returns for this component.
    """
    csv_prompt = "Extract the table into a CSV format. Output just CSV text, no other text. No quotes unless it's in the table."
    async with semaphore:
        csv_text = await formatter.format_component(doc, table_component, csv_prompt)
    return csv_text
formatted_tables = await asyncio.gather(*[format_table(table) for table in tables])
for i, formatted_table in enumerate(formatted_tables):
print(f"Table {i+1}:")
print(formatted_table)
print()
Role,Actor
Main character,Daniel Radcliffe
Sidekick 1,Rupert Grint
Sidekick 2,Emma Watson
Lovable ogre,Robbie Coltrane
Professor,Maggie Smith
Headmaster,Richard Harris
Name,2008 Entered,2008 Completed,2009 Entered,2009 Completed
Bob,22,21,20,19
Sue,44,12,12,10
Rainfall (inches),Americas,Asia,Europe,Africa
Average,133,244,155,166
24 hour high,27,28,29,20
12 hour high,11,12,13,16
...