Load and Parse Document
Detect Layout Components
Identify different sections and elements in the document.
Visualize Results (Optional)
See how the layout detection performed on your document.
Create semantically meaningful chunks from the document before storing them in a vector DB.
from parseport.document.pdf_document import PDFDocument
from parseport.layout_parser.simple_layout_parser import SimpleLayoutParser
from parseport.tools.layout_detector.paddle_layout_detector import PaddleLayoutDetector
from parseport.tools.layout_detector.yolo_layout_detector import YoloLayoutDetector
from parseport.tools.vlm_generator.openai_vlm_generator import OpenAIVLMGenerator
from parseport.content_reader.vlm_reader import VLMDocumentReader
from parseport.formatter.vlm_formatter import VLMFormatter
# Load the PDF, run layout detection over it, and preview the detected
# component blocks on one page.
doc = PDFDocument.from_path('scaling-law-paper.pdf', include_visuals=True)

# The YOLO-based detector is wired into the simple layout parser.
# NOTE(review): PaddleLayoutDetector is imported as well and is presumably a
# drop-in alternative here -- confirm before swapping.
detector = YoloLayoutDetector()
parser = SimpleLayoutParser(detector)
components = parser.parse_document(doc)

# Fixed: the original called `document.render_image_with_component_blocks`,
# but the document object is bound to `doc`, so that line raised NameError.
# Index 4 is page 5 per the original note (assumes 0-based page indexing).
image = doc.render_image_with_component_blocks(4, components)  # page 5

# NOTE(review): `Image` (presumably PIL.Image) and `display` (presumably the
# IPython/Jupyter builtin) are not imported in the visible part of this file;
# confirm both are in scope before running this outside a notebook.
display(Image.fromarray(image))

# Build a document reader on top of an OpenAI-backed VLM generator, then run
# text extraction over the components produced by the layout parser.
vlm_generator = OpenAIVLMGenerator()
reader = VLMDocumentReader(vlm_generator)
# Top-level `await`: valid in a notebook/IPython cell, but a SyntaxError in a
# plain Python script unless wrapped in an `async def` and driven by asyncio.
# The result rebinds `components`; presumably each returned component now
# carries the extracted `.text` that later steps read -- TODO confirm.
components = await reader.extract_texts(doc, components, show_progress=True)
from tiktoken import encoding_for_model

# Greedy token-budgeted chunking: component texts are appended to a running
# chunk for as long as the combined text stays within `chunk_size` tokens
# under the gpt-4o encoding; otherwise the running chunk is flushed and a new
# one starts with the current component's text.
#
# NOTE(review): texts are joined with no separator, and a single component
# longer than `chunk_size` tokens still becomes one (oversized) chunk --
# confirm both are intended.
encoding = encoding_for_model("gpt-4o")
chunk_size = 200
chunks = []
current_chunk = ""

for component in components:
    # Components with no `text` attribute, or with empty text, are skipped.
    text = getattr(component, 'text', None)
    if not text:
        continue

    candidate = current_chunk + text
    if len(encoding.encode(candidate)) <= chunk_size:
        # Still within budget: keep growing the running chunk.
        current_chunk = candidate
        continue

    # Over budget: flush whatever has accumulated, then restart from this text.
    if current_chunk:
        chunks.append(current_chunk)
    current_chunk = text

# Flush the trailing chunk, if any text is left over.
if current_chunk:
    chunks.append(current_chunk)

print(f"Created {len(chunks)} chunks")
Created 163 chunks