TextSpitter

TextSpitter — a text-extraction library that facilitates string consumption.

View Source

 1"""
 2TextSpitter — a text-extraction library that facilitates string consumption.
 3"""
 4
 5from importlib.metadata import PackageNotFoundError, version
 6
 7try:
 8    __version__ = version("textspitter")
 9except PackageNotFoundError:
10    __version__ = "unknown"
11
12from .main import WordLoader
13
14__all__ = ["TextSpitter", "WordLoader", "__version__"]
15
16
def TextSpitter(
    file_obj=None, filename: str | None = None, file_attr: str = "name"
) -> str:
    """
    Extract the text of a file and hand it back as a single string.

    Thin convenience wrapper: the arguments are forwarded unchanged to
    ``WordLoader`` and the result of its ``file_load()`` call is returned.

    Args:
        file_obj: A file path (str/Path), file-like object, bytes, or None.
        filename: Filename with extension. Used when file_obj has no name
                  attribute, or as the sole argument for path-based loading.
        file_attr: Name of the attribute on file_obj that holds its
                   filename. Defaults to "name".

    Returns:
        str: Extracted text content.
    """
    loader = WordLoader(
        file_obj=file_obj, filename=filename, file_attr=file_attr
    )
    return loader.file_load()

def TextSpitter( file_obj=None, filename: str | None = None, file_attr: str = 'name') -> str: View Source

def TextSpitter(
    file_obj=None, filename: str | None = None, file_attr: str = "name"
) -> str:
    """
    Extract the text of a file and hand it back as a single string.

    Thin convenience wrapper: the arguments are forwarded unchanged to
    ``WordLoader`` and the result of its ``file_load()`` call is returned.

    Args:
        file_obj: A file path (str/Path), file-like object, bytes, or None.
        filename: Filename with extension. Used when file_obj has no name
                  attribute, or as the sole argument for path-based loading.
        file_attr: Name of the attribute on file_obj that holds its
                   filename. Defaults to "name".

    Returns:
        str: Extracted text content.
    """
    loader = WordLoader(
        file_obj=file_obj, filename=filename, file_attr=file_attr
    )
    return loader.file_load()

Extract text from a file and return it as a string.

Args:

- file_obj: A file path (str/Path), file-like object, bytes, or None.
- filename: Filename with extension. Used when file_obj has no name attribute, or as the sole argument for path-based loading.
- file_attr: Attribute name to read from file_obj for its filename. Defaults to "name".

Returns: str: Extracted text content.

class WordLoader: View Source

 12class WordLoader:
 13    """
 14    Dispatch wrapper that routes a file to the correct
 15    :class:`~TextSpitter.core.FileExtractor` reader.
 16
 17    Accepts a file-system path (``str`` or :class:`pathlib.Path`) and an
 18    optional *filename* hint.  Strings are converted to
 19    :class:`~pathlib.Path` objects automatically.
 20
 21    Use :class:`~TextSpitter.core.FileExtractor` directly if you need to
 22    pass a ``BytesIO``, ``SpooledTemporaryFile``, or raw ``bytes``.
 23    """
 24
 25    FILE_EXT_MATRIX: dict[str, str] = {
 26        "pdf": "pdf_file_read",
 27        "docx": "docx_file_read",
 28        "txt": "text_file_read",
 29        "text": "text_file_read",
 30        "csv": "csv_file_read",
 31    }
 32
 33    TEXT_MIME_TYPES: frozenset[str] = frozenset(
 34        {
 35            "plain",
 36            "javascript",
 37            "x-python",
 38            "x-c",
 39            "x-java-source",
 40            "x-c++",
 41            "html",
 42            "css",
 43            "json",
 44            "xml",
 45        }
 46    )
 47
 48    def __init__(
 49        self,
 50        file_obj: str | Path | None = None,
 51        filename: str | None = None,
 52        file_attr: str = "name",
 53    ):
 54        if isinstance(file_obj, str):
 55            file_obj = Path(file_obj)
 56        self.file = FileExtractor(
 57            file_obj=file_obj, filename=filename, file_attr=file_attr
 58        )
 59
 60    def file_load(self) -> str:
 61        """
 62        The primary function for this object. The file is processed and then
 63        sent to the appropriate text extraction function based on the
 64        appropriate file mimetype.
 65
 66        Returns:
 67            str
 68        """
 69        file_type = self.file.file_ext.lower()
 70
 71        # Check if it's a specific supported format first
 72        if file_type in self.FILE_EXT_MATRIX:
 73            text = getattr(self.file, self.FILE_EXT_MATRIX[file_type])()
 74            return text
 75        # Check if it's a programming language file
 76        elif self.file.is_programming_language_file(file_type):
 77            logger.info(
 78                f"Processing programming language file: {self.file.file_name}"
 79            )
 80            text = self.file.code_file_read()
 81            return text
 82        else:
 83            # Fall back to mime type detection
 84            mime_type = self.file.get_file_type(self.file.file_name)
 85
 86            # Check if mime type suggests it's a text-based file
 87            if mime_type in self.TEXT_MIME_TYPES:
 88                logger.info(
 89                    f"Processing text-based file by mime type: {mime_type}"
 90                )
 91                text = (
 92                    self.file.code_file_read()
 93                )  # Use code_file_read for better encoding handling
 94                return text
 95
 96            logger.error(
 97                f"You are using an incorrect file format for file submissions. "
 98                f"Please upload a .docx/.doc/.txt/.pdf file or a supported "
 99                f"programming language file (.py, .js, .java, .cpp, etc.). "
100                f"Note the mimetype of your submitted data and submit an "
101                f"error report to github with the following: {mime_type}"
102            )
103
104            return ""

Dispatch wrapper that routes a file to the correct `TextSpitter.core.FileExtractor` reader.

Accepts a file-system path (str or pathlib.Path) and an optional filename hint. Strings are converted to `pathlib.Path` objects automatically.

Use `TextSpitter.core.FileExtractor` directly if you need to pass a BytesIO, SpooledTemporaryFile, or raw bytes.

WordLoader( file_obj: str | pathlib.Path | None = None, filename: str | None = None, file_attr: str = 'name') View Source

48    def __init__(
49        self,
50        file_obj: str | Path | None = None,
51        filename: str | None = None,
52        file_attr: str = "name",
53    ):
54        if isinstance(file_obj, str):
55            file_obj = Path(file_obj)
56        self.file = FileExtractor(
57            file_obj=file_obj, filename=filename, file_attr=file_attr
58        )

FILE_EXT_MATRIX: dict[str, str] = {'pdf': 'pdf_file_read', 'docx': 'docx_file_read', 'txt': 'text_file_read', 'text': 'text_file_read', 'csv': 'csv_file_read'}

TEXT_MIME_TYPES: frozenset[str] = frozenset({'plain', 'html', 'json', 'x-python', 'javascript', 'x-c', 'x-c++', 'xml', 'x-java-source', 'css'})

file

def file_load(self) -> str: View Source

 60    def file_load(self) -> str:
 61        """
 62        The primary function for this object. The file is processed and then
 63        sent to the appropriate text extraction function based on the
 64        appropriate file mimetype.
 65
 66        Returns:
 67            str
 68        """
 69        file_type = self.file.file_ext.lower()
 70
 71        # Check if it's a specific supported format first
 72        if file_type in self.FILE_EXT_MATRIX:
 73            text = getattr(self.file, self.FILE_EXT_MATRIX[file_type])()
 74            return text
 75        # Check if it's a programming language file
 76        elif self.file.is_programming_language_file(file_type):
 77            logger.info(
 78                f"Processing programming language file: {self.file.file_name}"
 79            )
 80            text = self.file.code_file_read()
 81            return text
 82        else:
 83            # Fall back to mime type detection
 84            mime_type = self.file.get_file_type(self.file.file_name)
 85
 86            # Check if mime type suggests it's a text-based file
 87            if mime_type in self.TEXT_MIME_TYPES:
 88                logger.info(
 89                    f"Processing text-based file by mime type: {mime_type}"
 90                )
 91                text = (
 92                    self.file.code_file_read()
 93                )  # Use code_file_read for better encoding handling
 94                return text
 95
 96            logger.error(
 97                f"You are using an incorrect file format for file submissions. "
 98                f"Please upload a .docx/.doc/.txt/.pdf file or a supported "
 99                f"programming language file (.py, .js, .java, .cpp, etc.). "
100                f"Note the mimetype of your submitted data and submit an "
101                f"error report to github with the following: {mime_type}"
102            )
103
104            return ""

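The primary method of this object. The file is routed to a text-extraction reader chosen first by its file extension, then by a programming-language check, and finally by its detected MIME type. If none of these match, an error is logged and an empty string is returned.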

Returns: str

__version__ = '1.0.0'