Skip to main content

Document Loaders

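Loaders ingest content and return a List[Document]. All loaders share the same interface: construct the loader with its input, then call load(). Of the loaders on this page, only WebLoader differs — its load() is a coroutine that must be awaited, and load_sync() is the blocking equivalent.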

Document schema

from synapsekit import Document

@dataclass
class Document:
    text: str
    metadata: dict = field(default_factory=dict)

TextLoader

Load a plain text file.

# No extra install needed
from synapsekit import TextLoader

docs = TextLoader("path/to/file.txt").load()
# docs[0].text → file contents
# docs[0].metadata → {"source": "path/to/file.txt"}

StringLoader

Wrap a raw string as a Document (useful for testing or dynamic content).

from synapsekit import StringLoader

docs = StringLoader("Your raw text here.", metadata={"source": "inline"}).load()

PDFLoader

Load a PDF file, returning one Document per page. The page index stored in metadata is zero-based.

pip install synapsekit[pdf]
from synapsekit import PDFLoader

docs = PDFLoader("report.pdf").load()
# docs[0].metadata → {"source": "report.pdf", "page": 0}
# docs[1].metadata → {"source": "report.pdf", "page": 1}

HTMLLoader

Load an HTML file, stripping all tags to plain text.

pip install synapsekit[html]
from synapsekit import HTMLLoader

docs = HTMLLoader("page.html").load()
# docs[0].text → plain text content
# docs[0].metadata → {"source": "page.html"}

CSVLoader

Load a CSV file, one Document per row.

from synapsekit import CSVLoader

# All columns joined as text
docs = CSVLoader("data.csv").load()

# Specify a dedicated text column — remaining columns become metadata
docs = CSVLoader("data.csv", text_column="content").load()
# docs[0].text → value of "content" column
# docs[0].metadata → all other columns + {"source": "...", "row": 0}

JSONLoader

Load a JSON file (list of objects or a single object).

from synapsekit import JSONLoader

# Default: reads "text" key from each object
docs = JSONLoader("data.json").load()

# Custom text key + promote specific fields to metadata
docs = JSONLoader(
"data.json",
text_key="content",
metadata_keys=["id", "category"],
).load()

DirectoryLoader

Load all matching files in a directory. Delegates to the correct loader per file extension (.txt, .pdf, .csv, .json, .html/.htm).

from synapsekit import DirectoryLoader

# Load all files recursively (default)
docs = DirectoryLoader("./my_docs/").load()

# Custom glob pattern
docs = DirectoryLoader("./my_docs/", glob_pattern="**/*.pdf").load()

# Non-recursive
docs = DirectoryLoader("./my_docs/", glob_pattern="*.txt", recursive=False).load()

Unreadable or unsupported files are silently skipped.


WebLoader

Fetch a URL and return its text content. Strips HTML tags automatically.

pip install synapsekit[web]
import asyncio
from synapsekit import WebLoader

# Async (recommended) — await only works inside a coroutine (e.g. one started with asyncio.run)
docs = await WebLoader("https://example.com").load()

# Sync
docs = WebLoader("https://example.com").load_sync()

# docs[0].text → stripped page text
# docs[0].metadata → {"source": "https://example.com"}

ExcelLoader

Load an Excel (.xlsx) file, one Document per sheet. Each sheet is converted to tab-separated text.

pip install synapsekit[excel]
from synapsekit import ExcelLoader

docs = ExcelLoader("data.xlsx").load()
# docs[0].text → tab-separated rows
# docs[0].metadata → {"source": "data.xlsx", "sheet": "Sheet1"}

PowerPointLoader

Load a PowerPoint (.pptx) file, one Document per slide. Extracts text from all shapes.

pip install synapsekit[pptx]
from synapsekit import PowerPointLoader

docs = PowerPointLoader("presentation.pptx").load()
# docs[0].text → text from the first slide
# docs[0].metadata → {"source": "presentation.pptx", "slide": 0}

DocxLoader

Load a Microsoft Word (.docx) file.

pip install synapsekit[docx]
from synapsekit import DocxLoader

docs = DocxLoader("report.docx").load()
# docs[0].text → paragraph text joined by newlines
# docs[0].metadata → {"source": "report.docx"}

MarkdownLoader

Load a Markdown file. Strips YAML frontmatter by default.

# No extra install needed
from synapsekit import MarkdownLoader

docs = MarkdownLoader("README.md").load()
# docs[0].text → markdown content (frontmatter stripped)
# docs[0].metadata → {"source": "README.md"}

# Keep frontmatter
docs = MarkdownLoader("README.md", strip_frontmatter=False).load()

Loading into the RAG facade

Every loader produces a List[Document] (for WebLoader, await load() or call load_sync()), which you can pass directly to add_documents():

from synapsekit import RAG, PDFLoader, DirectoryLoader

rag = RAG(model="gpt-4o-mini", api_key="sk-...")

# Single loader
rag.add_documents(PDFLoader("report.pdf").load())

# Multiple loaders
from itertools import chain
docs = list(chain(
PDFLoader("report.pdf").load(),
DirectoryLoader("./notes/").load(),
))
rag.add_documents(docs)

answer = rag.ask_sync("Summarize everything.")