← Back to Home
From
From
From raw
Copy-paste snippets for common tasks
Input handling
From BytesIO
from io import BytesIO
from TextSpitter import TextSpitter
with open("report.pdf", "rb") as f:
    text = TextSpitter(file_obj=BytesIO(f.read()), filename="report.pdf")
    # Stream is rewound automatically — no seek(0) needed.
From SpooledTemporaryFile
from tempfile import SpooledTemporaryFile
from TextSpitter import TextSpitter
with SpooledTemporaryFile(max_size=10 * 1024 * 1024) as stf:
    stf.write(pdf_bytes)
    stf.seek(0)
    text = TextSpitter(file_obj=stf, filename="upload.pdf")
From raw bytes
from TextSpitter import TextSpitter
text = TextSpitter(file_obj=pdf_bytes, filename="document.pdf")
Custom filename attribute
Some frameworks expose the original name on a non-standard attribute. Pass file_attr to tell FileExtractor where to look:
from TextSpitter.core import FileExtractor
fe = FileExtractor(file_obj=upload_obj, file_attr="original_name")
text = fe.text_file_read()
Format-specific
Force pypdf (skip PyMuPDF)
import TextSpitter.core as _core
from TextSpitter.core import FileExtractor
_real, _core.pymupdf = _core.pymupdf, None
try:
    text = FileExtractor(file_obj=pdf_bytes, filename="doc.pdf").pdf_file_read()
finally:
    _core.pymupdf = _real
DOCX paragraphs as a list
from TextSpitter import TextSpitter
text = TextSpitter(filename="doc.docx")
paragraphs = [ln for ln in text.splitlines() if ln.strip()]
CSV into rows
import csv, io
from TextSpitter import TextSpitter
raw = TextSpitter(filename="data.csv")
rows = list(csv.DictReader(io.StringIO(raw)))
print(rows[0])
Error and encoding
Detect decode warnings (loguru)
from loguru import logger
seen = []
logger.add(lambda m: seen.append(m.record["message"]), level="WARNING")
from TextSpitter import TextSpitter
TextSpitter(filename="legacy.txt")
if seen:
    print("Encoding issues:", seen)
Validate extraction result
from TextSpitter import TextSpitter
def safe_extract(path: str) -> str | None:
    text = TextSpitter(filename=path)
    return text if text and text.strip() else None
Testing
Unit-test a function that calls TextSpitter
def test_my_function(tmp_path):
    (tmp_path / "doc.txt").write_text("Hello world")
    result = my_function(str(tmp_path / "doc.txt"))
    assert "Hello" in result
Mock TextSpitter in integration tests
from unittest.mock import patch
with patch("myapp.routes.TextSpitter", return_value="mocked text"):
    response = client.post("/extract", ...)
    assert response.json()["text"] == "mocked text"
Capture log output
TextSpitter tests include a log_capture fixture that works with or without loguru:
def test_warns_on_bad_encoding(log_capture, tmp_path):
    (tmp_path / "bad.txt").write_bytes(b"\x81\xfe\xff")
    from TextSpitter import TextSpitter
    TextSpitter(filename=str(tmp_path / "bad.txt"))
    assert any("utf-8 or latin-1" in m for m in log_capture)
See the project's tests/conftest.py for the fixture definition.