Skip to main content

Load Document

from parseport.document.pdf_document import PDFDocument


from PIL import Image
import asyncio
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
from parseport.struct import ComponentType
Load your document containing tables.
# Load the PDF at 'tables.pdf'. include_visuals=True presumably keeps page
# imagery available for the later rendering/VLM steps — confirm against the
# PDFDocument documentation.
doc = PDFDocument.from_path('tables.pdf', include_visuals=True)

Detect Table Components

Use layout detection to identify table regions in the document.
# Paddle-based layout detector, configured to run on the CPU.
detector = PaddleLayoutDetector(device='cpu')
# The parser wraps the detector and is run over the whole loaded document.
parser = SimpleLayoutParser(detector)
# `components` is reused by the visualization and text-extraction steps below.
components = parser.parse_document(doc)

Visualize Results (Optional)

See how the layout detection performed on your document.
# Render page 5 (index 4) together with the detected component blocks.
# The loaded document is bound to `doc`; `document` is never defined here.
image = doc.render_image_with_component_blocks(4, components)
# `display` is the notebook (IPython) helper; Image.fromarray wraps the
# rendered array as a PIL image so it can be shown inline.
display(Image.fromarray(image))
Image

Extract Text from Tables

Extract text from the detected components, then keep only the components whose type is TABLE.
# The VLM generator is shared by the reader here and the formatter later on.
vlm_generator = OpenAIVLMGenerator()
reader = VLMDocumentReader(vlm_generator)
# Top-level `await`: run this in a notebook or another async-aware REPL.
# `components` is rebound to the result of the extraction call.
components = await reader.extract_texts(doc, components, show_progress=True)

# Keep only the components classified as tables.
tables = [component for component in components if component.type == ComponentType.TABLE]
print(f'Found {len(tables)} tables')
Found 29 tables

Convert to CSV Format

Convert each table component into clean CSV format.
# The formatter reuses the VLM generator created for text extraction.
formatter = VLMFormatter(vlm_generator)
# Semaphore with 10 permits; format_table acquires it around each format call.
semaphore = asyncio.Semaphore(10)

async def format_table(table_component):
    """Format one table component as CSV text using ``formatter``.

    The call is made while holding the module-level ``semaphore``, so the
    number of simultaneous format calls is bounded by its permit count.

    Returns whatever ``formatter.format_component`` returns for the CSV
    prompt below.
    """
    csv_prompt = "Extract the table into a CSV format. Output just CSV text, no other text. No quotes unless it's in the table."
    async with semaphore:
        result = await formatter.format_component(doc, table_component, csv_prompt)
    return result

formatted_tables = await asyncio.gather(*[format_table(table) for table in tables])

Save Results

Process and display the CSV-formatted tables, ready for export or further analysis.
# Print each table under a 1-based "Table N:" header, followed by a blank line.
for i, formatted_table in enumerate(formatted_tables):
    print(f"Table {i+1}:", formatted_table, "", sep="\n")
Table 1:
Role,Actor
Main character,Daniel Radcliffe
Sidekick 1,Rupert Grint
Sidekick 2,Emma Watson
Lovable ogre,Robbie Coltrane
Professor,Maggie Smith
Headmaster,Richard Harris

Table 2:
Name,2008 Entered,2008 Completed,2009 Entered,2009 Completed
Bob,22,21,20,19
Sue,44,12,12,10

Table 3:
Rainfall (inches),Americas,Asia,Europe,Africa
Average,133,244,155,166
24 hour high,27,28,29,20
12 hour high,11,12,13,16
...