Documentation Index
Fetch the complete documentation index at: https://parserport.outerport.com/llms.txt
Use this file to discover all available pages before exploring further.
Load and Parse Document
from parseport.document.pdf_document import PDFDocument
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
Load your document with visual elements enabled.
# Build a PDFDocument from the file path, passing include_visuals=True so that
# visual elements are requested. `doc` is reused by every later step.
doc = PDFDocument.from_path('scaling-law-paper.pdf', include_visuals=True)
Detect Layout Components
Identify different sections and elements in the document.
# Create the layout detector with no arguments.
# NOTE(review): PaddleLayoutDetector is also imported at the top of this page —
# presumably a drop-in alternative here; confirm before swapping.
detector = YoloLayoutDetector()
# Hand the detector to the layout parser.
parser = SimpleLayoutParser(detector)
# Parse the loaded document; `components` is consumed by the visualisation,
# text-extraction and chunking steps that follow.
components = parser.parse_document(doc)
Visualize Results (Optional)
See how the layout detection performed on your document.
# Render one page with the detected component blocks drawn on it, then show it.
# The loaded PDF is bound to `doc` (not `document`), so the call must go through `doc`.
# NOTE(review): `Image` and `display` are not imported anywhere in this snippet —
# presumably `from PIL import Image` and the notebook-provided IPython `display`;
# confirm and add the imports if this is run outside a notebook.
image = doc.render_image_with_component_blocks(4, components)  # index 4 -> page 5
display(Image.fromarray(image))
Extract text from all detected components.
# Create the OpenAI VLM generator with no arguments.
# NOTE(review): presumably picks up API credentials from the environment — confirm.
vlm_generator = OpenAIVLMGenerator()
# The reader uses the generator to read the document's components.
reader = VLMDocumentReader(vlm_generator)
# extract_texts is awaited, so this line must run where top-level `await` is
# allowed (e.g. a notebook cell) or inside an `async def`. The result rebinds
# `components`, which the chunking step below reads `.text` from.
components = await reader.extract_texts(doc, components, show_progress=True)
Create Semantic Chunks
Split text into token-aware chunks that respect document structure: components are accumulated in order until the token limit would be exceeded, so no component is ever split across chunks.
from tiktoken import encoding_for_model

# Tokenizer used to measure chunk length in tokens.
encoding = encoding_for_model("gpt-4o")
chunk_size = 200  # maximum number of tokens per chunk
# Placed between the texts of adjacent components so the last word of one
# component does not fuse with the first word of the next.
separator = "\n"

chunks = []
current_chunk = ""
for component in components:
    # Skip components that have no `text` attribute or whose text is empty.
    text = getattr(component, "text", None)
    if not text:
        continue

    # What the current chunk would look like with this component appended.
    candidate = f"{current_chunk}{separator}{text}" if current_chunk else text

    if len(encoding.encode(candidate)) > chunk_size:
        # Adding this component would exceed the limit: close the current
        # chunk (if any) and start a new one with this component.
        # A single component longer than chunk_size is kept whole and ends up
        # as its own oversized chunk; components are never split.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = text
    else:
        current_chunk = candidate

# Flush whatever is left after the last component.
if current_chunk:
    chunks.append(current_chunk)

print(f"Created {len(chunks)} chunks")