Recipes - TextSpitter

← Back to Home

Copy-paste snippets for common tasks

Input handling

From BytesIO

from io import BytesIO
from TextSpitter import TextSpitter

# Load the whole PDF into an in-memory buffer; the file handle is closed
# as soon as its bytes have been read.
with open("report.pdf", "rb") as pdf_file:
    buffer = BytesIO(pdf_file.read())

text = TextSpitter(file_obj=buffer, filename="report.pdf")
# Stream is rewound automatically — no seek(0) needed.

From SpooledTemporaryFile

from tempfile import SpooledTemporaryFile
from TextSpitter import TextSpitter

# Data is held in memory up to this size (10 MiB); beyond it the
# SpooledTemporaryFile rolls over to a real temporary file on disk.
MAX_IN_MEMORY_BYTES = 10 * 1024 * 1024

with SpooledTemporaryFile(max_size=MAX_IN_MEMORY_BYTES) as spooled:
    spooled.write(pdf_bytes)
    spooled.seek(0)  # move back to the start after writing
    text = TextSpitter(file_obj=spooled, filename="upload.pdf")

From raw bytes

from TextSpitter import TextSpitter

# pdf_bytes is a placeholder, not defined in this snippet: it is assumed to be
# a bytes object already holding the document's content.
text = TextSpitter(file_obj=pdf_bytes, filename="document.pdf")

Custom filename attribute

Some frameworks expose the original name on a non-standard attribute. Pass file_attr to tell FileExtractor where to look:

from TextSpitter.core import FileExtractor

# upload_obj is a placeholder for the framework's upload object; file_attr
# names the attribute on it that carries the original filename
# (here: upload_obj.original_name).
fe = FileExtractor(file_obj=upload_obj, file_attr="original_name")
text = fe.text_file_read()

Format-specific

Force pypdf (skip PyMuPDF)

import TextSpitter.core as _core
from TextSpitter.core import FileExtractor

# Temporarily blank out the module-level pymupdf reference. Per this recipe's
# intent, pdf_file_read() should then go through pypdf instead (assumed from
# the recipe title; confirm against TextSpitter.core).
_saved_pymupdf = _core.pymupdf
_core.pymupdf = None
try:
    text = FileExtractor(file_obj=pdf_bytes, filename="doc.pdf").pdf_file_read()
finally:
    # Always put the original reference back, even if extraction raises.
    _core.pymupdf = _saved_pymupdf

DOCX paragraphs as a list

from TextSpitter import TextSpitter

text = TextSpitter(filename="doc.docx")
# Keep only lines with visible content; empty and whitespace-only lines are dropped.
paragraphs = [line for line in text.splitlines() if line.strip()]

CSV into rows

import csv, io
from TextSpitter import TextSpitter

csv_text = TextSpitter(filename="data.csv")
# DictReader takes the first row as the header and yields one dict per data row.
reader = csv.DictReader(io.StringIO(csv_text))
rows = list(reader)
print(rows[0])

Error and encoding

Detect decode warnings (loguru)

from loguru import logger

seen = []
# Keep the handler id returned by logger.add() so this temporary sink can be
# detached again; otherwise it stays registered and keeps appending to `seen`
# for every later WARNING anywhere in the process.
handler_id = logger.add(lambda m: seen.append(m.record["message"]), level="WARNING")

from TextSpitter import TextSpitter
try:
    TextSpitter(filename="legacy.txt")
finally:
    # Detach the sink even if extraction raises.
    logger.remove(handler_id)

if seen:
    print("Encoding issues:", seen)

Validate extraction result

from TextSpitter import TextSpitter

def safe_extract(path: str) -> str | None:
    """Return the text extracted from *path*, or ``None`` if nothing usable came back.

    "Nothing usable" means the extraction result is falsy (e.g. ``None`` or an
    empty string) or consists only of whitespace.
    """
    extracted = TextSpitter(filename=path)
    if not extracted or not extracted.strip():
        return None
    return extracted

Testing

Unit-test a function that calls TextSpitter

def test_my_function(tmp_path):
    """Run ``my_function`` against a real text file created in pytest's temp dir."""
    doc_path = tmp_path / "doc.txt"
    doc_path.write_text("Hello world")
    result = my_function(str(doc_path))
    assert "Hello" in result

Mock TextSpitter in integration tests

from unittest.mock import patch

expected = "mocked text"

# Patch the name where myapp.routes looks it up, so the route sees the mock
# instead of the real extractor while the request is handled.
with patch("myapp.routes.TextSpitter", return_value=expected):
    response = client.post("/extract", ...)

assert response.json()["text"] == expected

Capture log output

TextSpitter's test suite includes a log_capture fixture that works with or without loguru:

def test_warns_on_bad_encoding(log_capture, tmp_path):
    """Check that extracting a file with undecodable bytes logs an encoding warning.

    ``log_capture`` is the project's fixture that collects emitted log
    messages; ``tmp_path`` is pytest's per-test temporary directory.
    """
    bad_file = tmp_path / "bad.txt"
    # Single backslashes, so the file really holds the three bytes
    # 0x81 0xFE 0xFF, which are not valid UTF-8. A doubled-backslash literal
    # would write twelve plain ASCII characters that decode cleanly, and the
    # warning asserted below would never be logged.
    bad_file.write_bytes(b"\x81\xfe\xff")
    from TextSpitter import TextSpitter
    TextSpitter(filename=str(bad_file))
    assert any("utf-8 or latin-1" in m for m in log_capture)

See the project's tests/conftest.py for the fixture definition.