Skip to main content

Load and Parse Document

from parseport.document.pdf_document import PDFDocument
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
Load your document with visual elements enabled.
# Open the PDF at the given relative path and bind it to `doc`, which the
# later snippets pass to the layout parser and the text reader.
# NOTE(review): include_visuals=True presumably keeps page imagery available
# for the layout/VLM steps below — confirm against the PDFDocument docs.
doc = PDFDocument.from_path('scaling-law-paper.pdf', include_visuals=True)

Detect Layout Components

Identify different sections and elements in the document.
# Build the layout pipeline: a detector constructed with its default
# arguments is handed to the parser, which then processes `doc`.
detector = YoloLayoutDetector()
parser = SimpleLayoutParser(detector)
# The result is bound to `components` and reused by the visualisation,
# text-extraction and chunking snippets.
components = parser.parse_document(doc)

Visualize Results (Optional)

See how the layout detection performed on your document.
# Render one page with the detected component blocks drawn on it. The loaded
# PDF is bound to `doc` (there is no `document` variable in this walkthrough).
# The first argument is the page index; per the inline note, index 4 is page 5.
image = doc.render_image_with_component_blocks(4, components)  # page 5
# NOTE(review): `Image` and `display` are not imported in this snippet —
# presumably `PIL.Image` and IPython's `display`; confirm and import them
# when running outside a notebook.
display(Image.fromarray(image))
Image

Extract Text

Extract text from all detected components.
# Wrap an OpenAI-backed VLM generator (constructed with default arguments)
# in a document reader.
# NOTE(review): presumably OpenAI credentials are read from the environment —
# confirm against the OpenAIVLMGenerator docs.
vlm_generator = OpenAIVLMGenerator()
reader = VLMDocumentReader(vlm_generator)
# `extract_texts` is awaited at top level, which only works where an event
# loop is already running (e.g. Jupyter/IPython); in a plain script, put this
# call inside an `async def` and run it with `asyncio.run(...)`.
# The result rebinds `components`, so the chunking step below reads the
# returned components rather than the raw layout output.
components = await reader.extract_texts(doc, components, show_progress=True)

Create Semantic Chunks

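Split the text into token-aware chunks that respect document structure: each component's text is kept whole, and consecutive components are packed into the same chunk until adding the next one would exceed the token budget.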
from tiktoken import encoding_for_model

# Greedily pack consecutive component texts into chunks of at most
# `chunk_size` tokens, never splitting a single component's text.
encoding = encoding_for_model("gpt-4o")
chunk_size = 200  # token budget per chunk
separator = "\n\n"  # keeps adjacent components from being glued together
chunks = []
current_chunk = ""

for component in components:
    text = getattr(component, "text", None)
    if not text:
        # Components without extracted text contribute nothing.
        continue

    candidate = f"{current_chunk}{separator}{text}" if current_chunk else text
    # Count tokens on the joined candidate rather than summing per-piece
    # counts: BPE token counts are not additive across a concatenation.
    # disallowed_special=() makes strings such as "<|endoftext|>" encode as
    # ordinary text instead of raising ValueError.
    token_count = len(encoding.encode(candidate, disallowed_special=()))
    if token_count > chunk_size:
        # Adding this component would overflow: flush what we have and start
        # a new chunk with it. A single component larger than the budget
        # still becomes its own (oversized) chunk, since it is never split.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = text
    else:
        current_chunk = candidate

# Flush the trailing partial chunk.
if current_chunk:
    chunks.append(current_chunk)

print(f"Created {len(chunks)} chunks")
Created 163 chunks