Skip to main content

Load the Document

from parseport.document.pdf_document import PDFDocument
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
Load your PDF receipt with visual elements enabled for better processing.
# Build the document object from the receipt PDF on disk. include_visuals=True
# requests that the PDF's visual elements be loaded as well. The resulting
# `doc` is what the layout parser, reader and formatter below operate on.
doc = PDFDocument.from_path('receipt.pdf', include_visuals=True)

Detect Layout Components

Identify different sections and elements in the receipt (text blocks, tables, etc.).
# Hand a YoloLayoutDetector to the SimpleLayoutParser and run the parser over
# the loaded document. The returned `components` are reused by the rendering,
# text-extraction and formatting steps further down.
# NOTE(review): PaddleLayoutDetector is imported above as well and is
# presumably an interchangeable detector -- confirm before swapping.
detector = YoloLayoutDetector()
parser = SimpleLayoutParser(detector)
components = parser.parse_document(doc)

Visualize Results (Optional)

See how the layout detection performed on your document.
# Render one page of the loaded document with the detected component blocks
# drawn on it, then show the result inline.
# The document object is bound to `doc` above; no `document` name is defined
# in this walkthrough.
# NOTE(review): `Image` is presumably PIL.Image and `display` the notebook
# built-in; neither is imported in the visible snippet -- confirm they are in
# scope before running this cell.
image = doc.render_image_with_component_blocks(0, components)  # page index 0 -- presumably the first page; confirm indexing
display(Image.fromarray(image))
Image

Extract Text

# Create the VLM generator once; the same instance is passed to the formatter
# in the next step as well.
# NOTE(review): OpenAIVLMGenerator() is constructed with no arguments, so it
# presumably picks up its API credentials from the environment -- confirm.
vlm_generator = OpenAIVLMGenerator()
reader = VLMDocumentReader(vlm_generator)
# Top-level `await` only works where an event loop is already running (for
# example a notebook cell); in a plain script, move this into an async
# function and drive it with asyncio.run().
# The result is bound back to `components`, so later steps see the reader's
# output rather than the parser's.
components = await reader.extract_texts(doc, components, show_progress=True)
The code above extracts the text from the detected components.

Format Specific Data

Structure the extracted text into your desired format (JSON, table, etc.).
# Reuse the generator created for the reader and ask the formatter to reshape
# the extracted text according to a natural-language instruction.
formatter = VLMFormatter(vlm_generator)
# Top-level `await`: needs a running event loop (e.g. a notebook cell).
# The third positional argument is the prompt. `formatted_texts` is iterated
# with .items() below, yielding (page_num, text) pairs.
formatted_texts = await formatter.format_text(
    doc, components,
    "Extract the prices of the items in the receipt in a JSON format. Output the JSON only, no other text."
)

Get Results

# Emit each page's formatted text, leaving two blank lines after every entry.
for page_num, text in formatted_texts.items():
    print(text, end="\n\n\n")
[
  {
    "description": "CLEARANCE! Fast Dell Desktop Computer PC DUAL CORE WINDOWS 10 4/8/16GB RAM",
    "net_price": 209.00,
    "net_worth": 627.00,
    "gross_worth": 689.70
  },
  {
    "description": "HP T520 Thin Client Computer AMD GX-212JC 1.2GHz 4GB RAM TESTED !!READ BELOW!!",
    "net_price": 37.75,
    "net_worth": 188.75,
    "gross_worth": 207.63
  },
 ...
]