Document Loaders

Loaders ingest content and return a List[Document]. All loaders share the same interface.

Document schema

from synapsekit import Document

# Reference definition (uses the stdlib dataclasses module):
from dataclasses import dataclass, field

@dataclass
class Document:
    text: str
    metadata: dict = field(default_factory=dict)
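
Documents can also be constructed directly, which is handy when mixing loader output with hand-written content:

doc = Document(text="Quarterly revenue grew 12%.", metadata={"source": "manual"})
# doc.text → "Quarterly revenue grew 12%."
# doc.metadata → {"source": "manual"}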

TextLoader

Load a plain text file.

# No extra install needed
from synapsekit import TextLoader

docs = TextLoader("path/to/file.txt").load()
# docs[0].text → file contents
# docs[0].metadata → {"source": "path/to/file.txt"}

StringLoader

Wrap a raw string as a Document (useful for testing or dynamic content).

from synapsekit import StringLoader

docs = StringLoader("Your raw text here.", metadata={"source": "inline"}).load()

PDFLoader

Load a PDF file, returning one Document per page.

pip install synapsekit[pdf]
from synapsekit import PDFLoader

docs = PDFLoader("report.pdf").load()
# docs[0].metadata → {"source": "report.pdf", "page": 0}
# docs[1].metadata → {"source": "report.pdf", "page": 1}

HTMLLoader

Load an HTML file, stripping all tags to plain text.

pip install synapsekit[html]
from synapsekit import HTMLLoader

docs = HTMLLoader("page.html").load()
# docs[0].text → plain text content
# docs[0].metadata → {"source": "page.html"}

CSVLoader

Load a CSV file, one Document per row.

from synapsekit import CSVLoader

# All columns joined as text
docs = CSVLoader("data.csv").load()

# Specify a dedicated text column — remaining columns become metadata
docs = CSVLoader("data.csv", text_column="content").load()
# docs[0].text → value of "content" column
# docs[0].metadata → all other columns + {"source": "...", "row": 0}

JSONLoader

Load a JSON file (list of objects or a single object).

from synapsekit import JSONLoader

# Default: reads "text" key from each object
docs = JSONLoader("data.json").load()

# Custom text key + promote specific fields to metadata
docs = JSONLoader(
    "data.json",
    text_key="content",
    metadata_keys=["id", "category"],
).load()

DirectoryLoader

Load all matching files in a directory. Delegates to the correct loader per file extension (.txt, .pdf, .csv, .json, .html/.htm).

from synapsekit import DirectoryLoader

# Load all files recursively (default)
docs = DirectoryLoader("./my_docs/").load()

# Custom glob pattern
docs = DirectoryLoader("./my_docs/", glob_pattern="**/*.pdf").load()

# Non-recursive
docs = DirectoryLoader("./my_docs/", glob_pattern="*.txt", recursive=False).load()

Unreadable or unsupported files are silently skipped.
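
To see which files were skipped, one option is to compare loaded sources against the directory listing. A minimal sketch, assuming each Document's "source" metadata holds the originating file path, as it does for the single-file loaders above:

from pathlib import Path
from synapsekit import DirectoryLoader

docs = DirectoryLoader("./my_docs/").load()
loaded = {d.metadata.get("source") for d in docs}
# Path strings may need normalizing depending on how the loader records them
skipped = [p for p in Path("./my_docs/").rglob("*")
           if p.is_file() and str(p) not in loaded]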


WebLoader

Fetch a URL and return its text content. Strips HTML tags automatically.

pip install synapsekit[web]
import asyncio
from synapsekit import WebLoader

# Async (recommended)
docs = await WebLoader("https://example.com").load()

# Sync
docs = WebLoader("https://example.com").load_sync()

# docs[0].text → stripped page text
# docs[0].metadata → {"source": "https://example.com"}

ExcelLoader

Load an Excel (.xlsx) file, one Document per sheet. Each sheet is converted to tab-separated text.

pip install synapsekit[excel]
from synapsekit import ExcelLoader

docs = ExcelLoader("data.xlsx").load()
# docs[0].text -> tab-separated rows
# docs[0].metadata -> {"source": "data.xlsx", "sheet": "Sheet1"}

PowerPointLoader

Load a PowerPoint (.pptx) file, one Document per slide. Extracts text from all shapes.

pip install synapsekit[pptx]
from synapsekit import PowerPointLoader

docs = PowerPointLoader("presentation.pptx").load()
# docs[0].text -> text from slide 1
# docs[0].metadata -> {"source": "presentation.pptx", "slide": 0}

DocxLoader

Load a Microsoft Word (.docx) file.

pip install synapsekit[docx]
from synapsekit import DocxLoader

docs = DocxLoader("report.docx").load()
# docs[0].text → paragraph text joined by newlines
# docs[0].metadata → {"source": "report.docx"}

MarkdownLoader

Load a Markdown file. Strips YAML frontmatter by default.

# No extra install needed
from synapsekit import MarkdownLoader

docs = MarkdownLoader("README.md").load()
# docs[0].text → markdown content (frontmatter stripped)
# docs[0].metadata → {"source": "README.md"}

# Keep frontmatter
docs = MarkdownLoader("README.md", strip_frontmatter=False).load()

AudioLoader

Transcribe audio files into Documents using the OpenAI Whisper API or a local Whisper model.

pip install synapsekit[audio]
from synapsekit import AudioLoader

# Using Whisper API (default)
docs = AudioLoader("interview.mp3", api_key="sk-...").load()
# docs[0].text → transcribed text
# docs[0].metadata → {"source": "interview.mp3", "loader": "AudioLoader", "backend": "whisper_api"}

# Using local Whisper
docs = AudioLoader("interview.mp3", backend="whisper_local").load()

# Async
docs = await AudioLoader("interview.mp3", api_key="sk-...").aload()

Supported formats: .mp3, .wav, .m4a, .ogg, .flac, .webm

Parameter   Type         Default         Description
path        str          required        Path to audio file
api_key     str | None   None            OpenAI API key (for whisper_api)
backend     str          "whisper_api"   "whisper_api" or "whisper_local"
language    str | None   None            Language hint
model       str          "whisper-1"     Whisper model name

VideoLoader

Extract audio from video files via ffmpeg, then transcribe using AudioLoader.

pip install synapsekit[video]
# Requires ffmpeg installed on your system
from synapsekit import VideoLoader

docs = VideoLoader("lecture.mp4", api_key="sk-...").load()
# docs[0].text → transcribed speech
# docs[0].metadata → {"source": "lecture.mp4", "loader": "VideoLoader", "backend": "whisper_api"}

# Async
docs = await VideoLoader("lecture.mp4", api_key="sk-...").aload()

Supported formats: .mp4, .mov, .avi, .mkv, .webm, .m4v

Parameter    Type         Default         Description
path         str          required        Path to video file
api_key      str | None   None            OpenAI API key
backend      str          "whisper_api"   Whisper backend
language     str | None   None            Language hint
keep_audio   bool         False           Keep extracted audio file

YAMLLoader

Load YAML files (list-of-objects or single-object) into Documents.

pip install synapsekit[yaml]
from synapsekit import YAMLLoader

# List of objects (each becomes a Document)
docs = YAMLLoader("data.yaml").load()

# Single object YAML
docs = YAMLLoader("config.yaml").load()

# Custom key extraction
docs = YAMLLoader("data.yaml", text_key="content", metadata_keys=["title", "author"]).load()

# Async
docs = await YAMLLoader("data.yaml").aload()

Parameter       Type        Default    Description
path            str         required   Path to YAML file
text_key        str         "text"     Key to extract as document text
metadata_keys   list[str]   []         Keys to include in metadata

XMLLoader

Load XML files and extract text content using Python's built-in xml.etree.ElementTree — no extra dependencies.

from synapsekit import XMLLoader

# Load all text from an XML file
docs = XMLLoader("feed.xml").load()

# Extract only specific tags
docs = XMLLoader("article.xml", tags=["title", "body", "summary"]).load()

# Custom encoding
docs = XMLLoader("data.xml", encoding="latin-1").load()

Parameter   Type               Default    Description
path        str                required   Path to XML file
tags        list[str] | None   None       Tag names to extract (all text if omitted)
encoding    str                "utf-8"    File encoding

DiscordLoader

Load messages from Discord channels using the Discord bot API.

pip install synapsekit[discord]
from synapsekit import DiscordLoader

# Load last 100 messages from a channel
loader = DiscordLoader(
    token="your-bot-token",
    channel_id=1234567890123456789,
)
docs = loader.load()  # synchronous

# or async
docs = await loader.aload()

# Paginate with before/after message IDs
docs = DiscordLoader(
    token="your-bot-token",
    channel_id=1234567890123456789,
    limit=50,
    before_message_id=9876543210,
    after_message_id=1111111111,
).load()

# Exclude metadata (text only)
docs = DiscordLoader(
    token="your-bot-token",
    channel_id=1234567890123456789,
    include_metadata=False,
).load()

Each message becomes one Document. Metadata includes author, message ID, channel ID, timestamp, attachments, and reactions.
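
For example, to flatten a channel into a readable transcript. The exact metadata key names used below ("author", "timestamp") are an assumption based on the fields listed above; check your documents' metadata if they differ:

docs = DiscordLoader(token="your-bot-token", channel_id=1234567890123456789).load()
for doc in docs:
    author = doc.metadata.get("author")    # assumed key name
    ts = doc.metadata.get("timestamp")     # assumed key name
    print(f"[{ts}] {author}: {doc.text}")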

Parameter           Type         Default    Description
token               str          required   Discord bot token
channel_id          int          required   Channel ID to load from
limit               int          100        Maximum messages to fetch
before_message_id   int | None   None       Fetch messages before this ID
after_message_id    int | None   None       Fetch messages after this ID
include_metadata    bool         True       Include author, timestamp, reactions, etc. in metadata
Info: The bot must have the Read Message History permission and Message Content Intent enabled in the Discord Developer Portal.


SlackLoader

Load messages from Slack channels using the Slack Bot API.

pip install synapsekit[slack]
from synapsekit import SlackLoader

# Load all messages from a channel
loader = SlackLoader(
    bot_token="xoxb-...",
    channel_id="C1234567890",
)
docs = loader.load()         # synchronous
docs = await loader.aload()  # async

# Limit the number of messages fetched
loader = SlackLoader(
    bot_token="xoxb-...",
    channel_id="C1234567890",
    limit=200,
)
docs = loader.load()

Each message becomes one Document. Metadata includes source (channel ID), ts (timestamp), user, and thread_ts.
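
Since replies carry their parent's thread_ts, the keys above let you separate top-level messages from thread replies; a small sketch assuming Slack's usual thread_ts semantics:

docs = loader.load()
# A top-level message has no thread_ts, or (for thread parents) one equal to its own ts.
top_level = [d for d in docs
             if d.metadata.get("thread_ts") in (None, d.metadata.get("ts"))]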

Parameter    Type         Default    Description
bot_token    str          required   Slack Bot OAuth token (xoxb-...)
channel_id   str          required   Channel ID to load from (e.g. C1234567890)
limit        int | None   None       Maximum messages to fetch (fetches all if not set)
Info: The bot must have the channels:history (public) or groups:history (private) OAuth scope, and must be a member of the channel.


NotionLoader

Load pages or databases from Notion using the Notion API.

pip install synapsekit[notion]
from synapsekit import NotionLoader

# Load a single page by ID
loader = NotionLoader(
    api_key="secret_...",
    page_id="abc12345-...",
)
docs = loader.load()         # synchronous
docs = await loader.aload()  # async

# Load all pages from a database
loader = NotionLoader(
    api_key="secret_...",
    database_id="def67890-...",
)
docs = loader.load()

Each page becomes one Document. Metadata includes source (page URL), page_id, and title.

Parameter     Type         Default    Description
api_key       str          required   Notion integration token (secret_...)
page_id       str | None   None       Load a single Notion page by ID
database_id   str | None   None       Load all pages from a database
max_retries   int          3          Retry attempts for transient API errors
timeout       float        30.0       HTTP request timeout in seconds

Exactly one of page_id or database_id is required.

Info: Create an internal integration at notion.so/my-integrations, then share the target page or database with that integration.


WikipediaLoader

Load Wikipedia articles as Documents. Accepts a single article title or multiple pipe-delimited titles.

pip install synapsekit[wikipedia]
from synapsekit import WikipediaLoader

# Single article
loader = WikipediaLoader(query="Python (programming language)")
docs = loader.load()
# docs[0].text → full article text
# docs[0].metadata → {"source": "wikipedia", "title": "...", "url": "...", "language": "en"}

# Multiple articles (pipe-delimited)
loader = WikipediaLoader(query="RAG | Vector database | Embeddings", max_results=3)
docs = loader.load()

# Async
docs = await loader.aload()

Parameter     Type   Default    Description
query         str    required   Article title(s), pipe-separated for multiple
language      str    "en"       Wikipedia language code
max_results   int    3          Maximum articles to return

ArXivLoader

Search arXiv and load papers as Documents (downloads PDFs and extracts text).

pip install synapsekit[arxiv,pdf]
from synapsekit import ArXivLoader

loader = ArXivLoader(
    query="retrieval augmented generation",
    max_results=5,
    sort_by="relevance",  # "relevance" | "lastUpdatedDate" | "submittedDate"
)
docs = loader.load()
# docs[0].text → full paper text
# docs[0].metadata → {"source": "arxiv", "title": "...", "authors": [...], "arxiv_id": "...", "url": "..."}

# Async
docs = await loader.aload()

Parameter     Type   Default       Description
query         str    required      Search query
max_results   int    5             Max papers to fetch
sort_by       str    "relevance"   Sort order: "relevance", "lastUpdatedDate", "submittedDate"

EmailLoader

Load emails from an IMAP mailbox (Gmail, Outlook, etc.) as Documents. Uses stdlib only — no extra dependencies.

# No extra install needed
from synapsekit import EmailLoader

loader = EmailLoader(
    imap_server="imap.gmail.com",
    email_address="user@gmail.com",
    password="app_password",  # use an App Password for Gmail
    folder="INBOX",
    search='SINCE "01-Jan-2024"',  # standard IMAP search syntax
    limit=50,
)
docs = loader.load()
# docs[0].text → email body (plain text)
# docs[0].metadata → {"source": "email", "subject": "...", "from": "...", "date": "...", "folder": "INBOX", "email_id": "..."}

# Async
docs = await loader.aload()

Parameter       Type         Default    Description
imap_server     str          required   IMAP hostname, e.g. imap.gmail.com
email_address   str          required   Email address to log in as
password        str          required   Password or App Password
folder          str          "INBOX"    Mailbox folder to read
search          str          "ALL"      IMAP search query (e.g. 'SINCE "01-Jan-2024"', "UNSEEN")
limit           int | None   None       Max emails to load (most recent first)

GoogleDriveLoader

Load files and folders from Google Drive using the Drive API v3. Supports Google Docs (exported as plain text), Google Sheets (exported as CSV), PDFs, and other text files.

pip install synapsekit[gdrive]

Requires a service account with the Drive API enabled and read access to the target files or folders.

from synapsekit import GoogleDriveLoader

# Load a single file by ID
loader = GoogleDriveLoader(
    credentials_path="service-account.json",
    file_id="1abc...",
)
docs = loader.load()

# Or async
docs = await loader.aload()

# Load all files from a folder
loader = GoogleDriveLoader(
    credentials_path="service-account.json",
    folder_id="1def...",
)
docs = loader.load()

# Pass credentials as a dict (e.g., from env var JSON)
import json, os
loader = GoogleDriveLoader(
    credentials_dict=json.loads(os.environ["GDRIVE_CREDS"]),
    file_id="1abc...",
)
docs = loader.load()

Each file becomes one Document. Metadata includes source, file_name, mime_type, modified, and file_id. Subfolders are skipped when loading a folder. Files that fail to download are skipped with a warning.

Parameter          Type          Default   Description
credentials_path   str | None    None      Path to service account JSON file
credentials_dict   dict | None   None      Service account credentials as a dict
file_id            str | None    None      ID of a single file to load
folder_id          str | None    None      ID of a folder — loads all files inside

Either credentials_path or credentials_dict is required (not both). Either file_id or folder_id is required (not both).

Supported MIME types
  • Google Docs (application/vnd.google-apps.document) — exported as plain text
  • Google Sheets (application/vnd.google-apps.spreadsheet) — exported as CSV
  • PDFs and text files — downloaded directly
  • Other binary files — returned as [Binary file: {mime_type}]
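
Since unsupported binaries come back as placeholder text, you may want to drop them before indexing; a minimal filter based on the placeholder format above:

docs = loader.load()
text_docs = [d for d in docs if not d.text.startswith("[Binary file:")]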

ImageLoader

Load images as Documents. Without a vision LLM, returns a metadata-only placeholder. With a vision LLM (any object with an async generate method), returns the LLM's description of the image.

# No extra install needed — stdlib only
from synapsekit import ImageLoader

# Without LLM — metadata placeholder
loader = ImageLoader("path/to/photo.jpg")
docs = loader.load()
# docs[0].text → "[Image: path/to/photo.jpg]"
# docs[0].metadata → {"source": "...", "media_type": "image/jpeg", "file_size": 102400}

# With vision LLM — async_load() returns a description
from synapsekit.llm.openai import OpenAILLM
from synapsekit.llm.base import LLMConfig

llm = OpenAILLM(LLMConfig(model="gpt-4o", api_key="sk-..."))
loader = ImageLoader(
    "path/to/diagram.png",
    llm=llm,
    prompt="Describe this diagram in detail, including any text visible.",
)
docs = await loader.async_load()
# docs[0].text → "The diagram shows a RAG pipeline with ..."
# docs[0].metadata → {"source": "...", "media_type": "image/png", "file_size": ..., "description_prompt": "..."}

Parameter   Type         Default                            Description
path        str | Path   required                           Path to the image file
llm         Any | None   None                               Vision LLM with async generate(prompt, image_url=...)
prompt      str          "Describe this image in detail."   Prompt sent to the vision LLM
Async vs sync
  • load() — always returns a placeholder [Image: <path>] regardless of LLM
  • async_load() — uses the LLM to generate a real description (requires a vision model like gpt-4o)

ConfluenceLoader

Load pages from Atlassian Confluence as Documents. Supports loading a single page by ID or an entire space, with automatic pagination and retry on rate limits.

pip install synapsekit[confluence]
from synapsekit import ConfluenceLoader

# Load a single page by ID
loader = ConfluenceLoader(
    url="https://yourcompany.atlassian.net/wiki",
    username="you@example.com",
    api_token="your-api-token",
    page_id="123456789",
)
docs = loader.load()         # synchronous
docs = await loader.aload()  # async

# Load all pages in a space
loader = ConfluenceLoader(
    url="https://yourcompany.atlassian.net/wiki",
    username="you@example.com",
    api_token="your-api-token",
    space_key="ENG",
    limit=50,  # optional cap
)
docs = loader.load()

Each page becomes one Document. Metadata includes source, title, page_id, space, url, version, author, and last_modified.

Parameter   Type         Default    Description
url         str          required   Base URL of your Confluence instance
username    str          required   Your Atlassian email address
api_token   str          required   Atlassian API token
page_id     str | None   None       Load a single page by its ID
space_key   str | None   None       Load all pages from a Confluence space
limit       int | None   None       Max pages to load when fetching a space

Exactly one of page_id or space_key is required.

Info: Generate an API token at id.atlassian.com/manage-profile/security/api-tokens. Rate-limit errors (HTTP 429) are retried with exponential back-off automatically.


RSSLoader

Load articles from RSS or Atom feeds as Documents.

pip install synapsekit[rss]
from synapsekit import RSSLoader

loader = RSSLoader("https://feeds.feedburner.com/oreilly/radar")
docs = loader.load() # synchronous
docs = await loader.aload() # async

# docs[0].text → article body (full content if available, summary otherwise)
# docs[0].metadata → {"title": "...", "published": "...", "link": "...", "author": "..."}

Each feed entry becomes one Document. Metadata fields (title, published, link, author) are omitted when empty.
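
Because empty fields are omitted rather than set to None, read them with .get() and a fallback:

for doc in docs:
    title = doc.metadata.get("title", "(untitled)")
    author = doc.metadata.get("author", "unknown")
    print(f"{title} by {author}")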

Parameter   Type   Description
url         str    URL of the RSS or Atom feed

GCSLoader

Load files from a Google Cloud Storage bucket as Documents. Install with pip install synapsekit[gcs].

from synapsekit import GCSLoader

loader = GCSLoader(
    bucket_name="my-bucket",
    prefix="documents/",
    credentials_path="service-account.json",
    max_files=100,
)

docs = await loader.aload()

Parameter          Type          Description
bucket_name        str           GCS bucket name (required)
prefix             str | None    Optional prefix filter (e.g. "documents/")
credentials_path   str | None    Path to a service account JSON file
credentials_dict   dict | None   Service account credentials as a dict
max_files          int | None    Maximum number of files to load

If neither credentials_path nor credentials_dict is provided, default application credentials are used. Binary files are kept with a placeholder string and their content type in metadata.


SQLLoader

Load rows from any SQLAlchemy-supported database (PostgreSQL, MySQL, SQLite, etc.) as Documents. Install with pip install synapsekit[sql].

from synapsekit import SQLLoader

loader = SQLLoader(
    connection_string="postgresql://user:pass@localhost/db",
    query="SELECT id, title, body, author FROM articles WHERE published = true",
    text_columns=["title", "body"],
    metadata_columns=["id", "author"],
)

docs = await loader.aload()

Parameter           Type               Description
connection_string   str                SQLAlchemy database URL (required)
query               str                SQL query to execute (required)
text_columns        list[str] | None   Columns concatenated into the document text. Defaults to all columns.
metadata_columns    list[str] | None   Columns included in metadata. Defaults to all columns.

Each Document gets metadata["source"] = "sql" and metadata["row_index"] automatically.


GitHubLoader

Load README, issues, pull requests, or repository files from a GitHub repository via the REST API. Uses the existing httpx dependency — no new install needed if you already have synapsekit[web].

from synapsekit import GitHubLoader

# README
loader = GitHubLoader(repo="SynapseKit/SynapseKit", content_type="readme")

# Issues (filters out PRs automatically)
loader = GitHubLoader(repo="SynapseKit/SynapseKit", content_type="issues", limit=20)

# Pull requests
loader = GitHubLoader(repo="SynapseKit/SynapseKit", content_type="prs", limit=10)

# Repository files (recursive Git Trees API)
loader = GitHubLoader(
    repo="SynapseKit/SynapseKit",
    content_type="files",
    path="src/synapsekit/llm/",
    limit=50,
    token="ghp_...",  # optional but recommended for higher rate limits
)

docs = await loader.load()

Parameter      Type                                    Description
repo           str                                     Repository in owner/repo format (required)
content_type   "readme" | "issues" | "prs" | "files"   What to load. Defaults to "readme".
token          str | None                              GitHub token for higher rate limits
path           str | None                              Path prefix filter (only for files)
limit          int | None                              Maximum number of items to load

Includes retry with exponential back-off for rate limits (HTTP 429) and 5xx errors.


GitLoader

Load files from a Git repository — local path or remote URL — at any revision. Supports glob pattern filtering.

pip install synapsekit[git]
from synapsekit import GitLoader

# Local repo, all files at HEAD
loader = GitLoader("/path/to/repo")

# Remote repo, specific revision, only Python files
loader = GitLoader(
    repo="https://github.com/org/repo.git",
    revision="v2.0.0",
    glob_pattern="**/*.py",
)

docs = loader.load()
# or
docs = await loader.aload()

Parameter      Type   Default    Description
repo           str    required   Local path or remote URL
revision       str    "HEAD"     Git revision (branch, tag, commit hash)
glob_pattern   str    "**/*"     Glob filter for file paths

Each document's metadata includes path, commit_hash, author, and date.


GoogleSheetsLoader

Load rows from a Google Sheets spreadsheet as Documents. Each row becomes one document; headers become field names.

pip install synapsekit[gsheets]
from synapsekit import GoogleSheetsLoader

loader = GoogleSheetsLoader(
    spreadsheet_id="1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgVE2upms",
    sheet_name="Sheet1",  # optional — auto-detects first sheet
    credentials_path="credentials.json",
)

docs = loader.load()
# or
docs = await loader.aload()

Parameter          Type         Default              Description
spreadsheet_id     str          required             Google Sheets document ID from the URL
sheet_name         str | None   None                 Sheet tab name; first sheet used if omitted
credentials_path   str          "credentials.json"   Path to service account credentials file

Row text format: "ColumnA: value, ColumnB: value, ...". Metadata includes source URL, sheet, and row index.
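
If you need the fields back as a mapping, the row text can be split apart again; a naive sketch that assumes no cell value itself contains ", " or ": ":

fields = dict(part.split(": ", 1) for part in docs[0].text.split(", "))
# {"ColumnA": "value", "ColumnB": "value", ...}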


JiraLoader

Load Jira issues using a JQL query. Handles Atlassian Document Format (ADF) descriptions, pagination, and rate-limit retry automatically.

pip install synapsekit[jira]
from synapsekit import JiraLoader

loader = JiraLoader(
    url="https://your-domain.atlassian.net",
    username="your-email@example.com",
    api_token="your-api-token",
    jql="project = MYPROJ AND status = Open",
    limit=100,  # optional
)

# Async (recommended)
docs = await loader.aload()

# Sync
docs = loader.load()

Parameter   Type         Default    Description
url         str          required   Jira instance base URL
username    str          required   Jira account email
api_token   str          required   Jira API token
jql         str          required   JQL query string
limit       int | None   None       Maximum number of issues to load

Each document includes the issue summary, description, and comments. Metadata includes key, status, assignee, priority, and source.
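
For example, to group loaded issues by status using the metadata keys listed above:

from collections import defaultdict

by_status = defaultdict(list)
for doc in docs:
    by_status[doc.metadata.get("status")].append(doc.metadata.get("key"))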


SupabaseLoader

Load rows from a Supabase table as Documents. Supports column selection and environment variable auth.

pip install synapsekit[supabase]
from synapsekit import SupabaseLoader

# All columns, credentials from env vars (SUPABASE_URL, SUPABASE_KEY)
loader = SupabaseLoader(table="articles")

# Specific text and metadata columns
loader = SupabaseLoader(
    table="articles",
    supabase_url="https://xyz.supabase.co",
    supabase_key="your-anon-key",
    text_columns=["title", "content"],
    metadata_columns=["id", "author", "created_at"],
)

docs = loader.load()
# or
docs = await loader.aload()

Parameter          Type               Default            Description
table              str                required           Supabase table name
supabase_url       str | None         SUPABASE_URL env   Supabase project URL
supabase_key       str | None         SUPABASE_KEY env   Supabase anon/service key
text_columns       list[str] | None   None               Columns to include in document text; all columns used if omitted
metadata_columns   list[str] | None   None               Columns to include in metadata

TeamsLoader

Load messages from Microsoft Teams channels via the Microsoft Graph API.

pip install synapsekit[teams]
from synapsekit.loaders import TeamsLoader

loader = TeamsLoader(
    team_id="your-team-id",
    channel_id="your-channel-id",
    access_token="Bearer ...",
    max_messages=500,
)
docs = loader.load()
# Each doc: message body as text, metadata includes author, timestamp, channel_id

  • Automatic pagination via @odata.nextLink
  • HTML-to-plain-text conversion
  • Exponential backoff for 429 / 5xx responses

S3Loader

Load files from Amazon S3 buckets.

pip install synapsekit[s3]
from synapsekit.loaders import S3Loader

loader = S3Loader(
    bucket="my-bucket",
    prefix="docs/",  # optional key prefix
    extensions=[".pdf", ".txt"],  # optional filter
    max_files=100,
    aws_access_key_id="AKI...",
    aws_secret_access_key="...",
)
docs = loader.load()

Supports text files directly and PDF / DOCX / XLSX / PPTX / CSV / JSON / HTML via the corresponding loaders. Falls back to a raw binary read for unknown types. Uses the ambient IAM role if no explicit credentials are given.


AzureBlobLoader

Load blobs from Azure Blob Storage containers.

pip install synapsekit[azure]
from synapsekit.loaders import AzureBlobLoader

loader = AzureBlobLoader(
    container="my-container",
    connection_string="DefaultEndpointsProtocol=https;...",
    prefix="reports/",
    max_files=50,
)
docs = loader.load()

Also accepts account_url + credential for token-based auth.


MongoDBLoader

Load documents from a MongoDB collection.

pip install synapsekit[mongodb]
from synapsekit.loaders import MongoDBLoader

loader = MongoDBLoader(
    uri="mongodb://localhost:27017",
    database="mydb",
    collection="articles",
    text_fields=["title", "body"],
    metadata_fields=["author", "created_at"],
    query_filter={"published": True},
)
docs = loader.load()

DropboxLoader

Load files from a Dropbox folder.

pip install synapsekit[dropbox]
from synapsekit.loaders import DropboxLoader

loader = DropboxLoader(
    access_token="sl.xxx",
    folder_path="/Reports",
    extensions=[".txt", ".md", ".pdf"],
    limit=200,
)
docs = loader.load()

Supports 20+ text and code extensions. Skips files that fail to download instead of raising.


EPUBLoader

Load EPUB files chapter-by-chapter.

pip install synapsekit[epub]
from synapsekit.loaders import EPUBLoader

loader = EPUBLoader("book.epub")
docs = loader.load()
# One Document per chapter; metadata: title, author, chapter_name

LaTeXLoader

Load .tex files as plain text. No external dependencies.

from synapsekit.loaders import LaTeXLoader

loader = LaTeXLoader("paper.tex")
docs = loader.load()
# Strips commands, environments, math, and comments; captures section titles in metadata

TSVLoader

Load tab-separated files, one Document per row.

from synapsekit.loaders import TSVLoader

loader = TSVLoader("data.tsv", text_column="description")
docs = loader.load()
# text_column becomes the document body; all other columns become metadata

RTFLoader

Load RTF files as plain text via striprtf.

pip install synapsekit[rtf]
from synapsekit.loaders import RTFLoader

loader = RTFLoader("document.rtf")
docs = loader.load()

The default encoding is latin-1; its superset Windows-1252 is the encoding typically used by Office and WordPad.


ConfigLoader

Load .env, .ini, .cfg, .toml, and environment-specific dotfiles into Documents.

from synapsekit.loaders import ConfigLoader

loader = ConfigLoader(".env.production")
docs = loader.load()
# Sensitive keys (password, secret, token, api_key, auth) are redacted automatically

Supported: .env, .env.local, .env.staging, .env.production, .ini, .cfg, .toml.


OneDriveLoader

Load files from OneDrive and SharePoint via Microsoft Graph API.

from synapsekit.loaders import OneDriveLoader

loader = OneDriveLoader(
    access_token="Bearer ...",
    folder_path="/Documents/Reports",
    recursive=True,
    extensions=[".pdf", ".docx"],
    max_files=100,
)
docs = loader.load()

Extracts PDF, DOCX, XLSX, PPTX, CSV, JSON, HTML via existing loaders. Uses stdlib HTTP — no external SDK required.


ParquetLoader

Load Parquet files as Documents, one Document per row.

pip install synapsekit[parquet]
from synapsekit.loaders import ParquetLoader

loader = ParquetLoader("data.parquet", text_column="content")
docs = loader.load()
# text_column becomes the document body; remaining columns become metadata

Supports local files and remote URLs via pandas.read_parquet.


RedisLoader

Load key/value pairs from a Redis database as Documents.

pip install synapsekit[redis]
from synapsekit.loaders import RedisLoader

loader = RedisLoader(
    host="localhost",
    port=6379,
    pattern="docs:*",  # key pattern filter
    value_type="hash",  # "string", "hash", or "json"
)
docs = loader.load()
# Each doc: key as metadata, value as text

ElasticsearchLoader

Load documents from an Elasticsearch index.

pip install synapsekit[elasticsearch]
from synapsekit.loaders import ElasticsearchLoader

loader = ElasticsearchLoader(
    hosts=["http://localhost:9200"],
    index="my-index",
    text_field="content",
    metadata_fields=["author", "timestamp"],
    query={"match_all": {}},  # optional DSL query; omit for full scan
)
docs = loader.load()

DynamoDBLoader

Load items from an AWS DynamoDB table.

pip install synapsekit[dynamodb]
from synapsekit.loaders import DynamoDBLoader

# Full table scan
loader = DynamoDBLoader(
    table_name="my-table",
    text_fields=["title", "body"],
    metadata_fields=["author", "created_at"],
    region_name="us-east-1",
)
docs = loader.load()

# Query mode (specific partition)
loader = DynamoDBLoader(
    table_name="my-table",
    text_fields=["body"],
    key_condition_expression="pk = :pk",
    expression_attribute_values={":pk": {"S": "user#123"}},
)
docs = loader.load()

Automatically paginates using LastEvaluatedKey. Deserialises typed DynamoDB attribute values (S, N, BOOL, L, M, etc.).


YouTubeLoader

Load transcripts from YouTube videos via the youtube-transcript-api library.

pip install synapsekit[youtube]
from synapsekit.loaders import YouTubeLoader

loader = YouTubeLoader(video_id="dQw4w9WgXcQ", language="en")
docs = loader.load()
# docs[0].text → transcript text
# docs[0].metadata → {"source": "youtube", "video_id": "...", "language": "en"}

Parameter   Type   Default    Description
video_id    str    required   YouTube video ID
language    str    "en"       Preferred transcript language

ObsidianLoader

Load an Obsidian vault directory, resolving [[wikilinks]] and YAML frontmatter.

# No extra install needed
from synapsekit.loaders import ObsidianLoader

loader = ObsidianLoader("/path/to/vault")
docs = loader.load()
# One Document per note; frontmatter promoted to metadata; [[links]] resolved to titles

Parameter    Type   Default    Description
vault_path   str    required   Path to the Obsidian vault root
recursive    bool   True       Traverse subdirectories

AirtableLoader

Load records from an Airtable base via the Airtable REST API.

pip install synapsekit[airtable]
from synapsekit.loaders import AirtableLoader

loader = AirtableLoader(
    api_key="keyXXX",
    base_id="appXXX",
    table_name="Tasks",
)
docs = loader.load()

SitemapLoader

Crawl an XML sitemap and load all linked pages.

pip install synapsekit[web]
from synapsekit.loaders import SitemapLoader

loader = SitemapLoader("https://example.com/sitemap.xml", max_pages=50)
docs = await loader.aload()

HubSpotLoader

Load HubSpot contacts, companies, or deals via the HubSpot API.

pip install synapsekit[hubspot]
from synapsekit.loaders import HubSpotLoader

loader = HubSpotLoader(
    api_key="pat-...",
    object_type="contacts",  # "contacts", "companies", "deals"
)
docs = loader.load()

SalesforceLoader

Load Salesforce records via SOQL queries using the simple-salesforce client.

pip install synapsekit[salesforce]
from synapsekit.loaders import SalesforceLoader

loader = SalesforceLoader(
    username="user@example.com",
    password="...",
    security_token="...",
    soql="SELECT Id, Name, Description FROM Account LIMIT 100",
)
docs = loader.load()

BigQueryLoader

Load rows from a Google BigQuery table or SQL query.

pip install synapsekit[bigquery]
from synapsekit.loaders import BigQueryLoader

loader = BigQueryLoader(
    project="my-gcp-project",
    query="SELECT title, body FROM `my_dataset.articles` LIMIT 500",
    credentials_path="service-account.json",
)
docs = loader.load()

PubMedLoader

Load PubMed abstracts and metadata by PMID list or free-text search. No extra dependencies.

# No extra install needed
from synapsekit.loaders import PubMedLoader

# Search by keyword
loader = PubMedLoader(query="retrieval augmented generation", max_results=10)
docs = loader.load()

# Load specific PMIDs
loader = PubMedLoader(pmids=["37160872", "36823232"])
docs = loader.load()

SnowflakeLoader

Load rows from Snowflake via a SQL query.

pip install synapsekit[snowflake]
from synapsekit.loaders import SnowflakeLoader

loader = SnowflakeLoader(
    account="myaccount.us-east-1",
    user="myuser",
    password="...",
    warehouse="COMPUTE_WH",
    database="MY_DB",
    schema="PUBLIC",
    query="SELECT title, content FROM articles WHERE active = TRUE",
)
docs = loader.load()

FirestoreLoader

Load documents from a Google Firestore collection.

pip install synapsekit[firestore]
from synapsekit.loaders import FirestoreLoader

loader = FirestoreLoader(
    collection="articles",
    credentials_path="service-account.json",
    text_fields=["title", "body"],
)
docs = loader.load()

ZendeskLoader

Load tickets from Zendesk via the Support API.

pip install synapsekit[zendesk]
from synapsekit.loaders import ZendeskLoader

loader = ZendeskLoader(
    subdomain="mycompany",
    email="agent@example.com",
    api_token="your-api-token",
    status="open",  # "open", "pending", "solved", "closed", or None for all
)
docs = loader.load()

IntercomLoader

Load conversations from Intercom via the REST API.

pip install synapsekit[intercom]
from synapsekit.loaders import IntercomLoader

loader = IntercomLoader(access_token="dG9rXXX", max_results=200)
docs = loader.load()
# Each Document contains one conversation thread

FreshdeskLoader

Load tickets from Freshdesk via the v2 API.

pip install synapsekit[freshdesk]
from synapsekit.loaders import FreshdeskLoader

loader = FreshdeskLoader(
    domain="mycompany.freshdesk.com",
    api_key="your-api-key",
    status=2,  # 2 = open; see Freshdesk docs for other status codes
)
docs = loader.load()

HackerNewsLoader

Load HN stories and comments via the Firebase REST API. No extra dependencies.

# No extra install needed
from synapsekit.loaders import HackerNewsLoader

# Top stories
loader = HackerNewsLoader(story_type="top", max_stories=30)
docs = loader.load()

# Load a specific item by ID
loader = HackerNewsLoader(item_id=39443107)
docs = loader.load()

RedditLoader

Load Reddit posts and comments via PRAW.

pip install synapsekit[reddit]
from synapsekit.loaders import RedditLoader

loader = RedditLoader(
    client_id="...",
    client_secret="...",
    user_agent="synapsekit/1.0",
    subreddit="MachineLearning",
    post_limit=25,
    include_comments=True,
)
docs = loader.load()

TwitterLoader

Load tweets via the Twitter API v2.

pip install synapsekit[twitter]
from synapsekit.loaders import TwitterLoader

loader = TwitterLoader(
    bearer_token="AAAA...",
    query="retrieval augmented generation lang:en",
    max_results=50,
)
docs = loader.load()

GoogleCalendarLoader

Load events from Google Calendar via the Calendar API v3.

pip install synapsekit[gcal]
from synapsekit.loaders import GoogleCalendarLoader

loader = GoogleCalendarLoader(
    credentials_path="service-account.json",
    calendar_id="primary",
    time_min="2026-01-01T00:00:00Z",
    max_results=100,
)
docs = loader.load()
# Each event becomes one Document; metadata: summary, start, end, location, attendees

TrelloLoader

Load Trello cards and boards via the Trello REST API.

pip install synapsekit[trello]
from synapsekit.loaders import TrelloLoader

loader = TrelloLoader(
    api_key="...",
    api_token="...",
    board_id="...",
)
docs = loader.load()
# Each card becomes one Document; metadata: list_name, labels, due_date, url

Loading into the RAG facade

All loaders return List[Document], which you can pass directly to add_documents():

from synapsekit import RAG, PDFLoader, DirectoryLoader

rag = RAG(model="gpt-4o-mini", api_key="sk-...")

# Single loader
rag.add_documents(PDFLoader("report.pdf").load())

# Multiple loaders
from itertools import chain
docs = list(chain(
    PDFLoader("report.pdf").load(),
    DirectoryLoader("./notes/").load(),
))
rag.add_documents(docs)

answer = rag.ask_sync("Summarize everything.")