TextSpitter
TextSpitter — a text-extraction library that facilitates string consumption.
"""
TextSpitter — a text-extraction library that facilitates string consumption.
"""

from importlib.metadata import PackageNotFoundError, version

# Resolve the installed distribution's version; fall back to a placeholder
# when no metadata for "textspitter" can be found.
try:
    __version__ = version("textspitter")
except PackageNotFoundError:
    __version__ = "unknown"

from .main import WordLoader

__all__ = ["TextSpitter", "WordLoader", "__version__"]


def TextSpitter(
    file_obj=None, filename: str | None = None, file_attr: str = "name"
) -> str:
    """
    Return the text content of a file as a single string.

    This is a convenience wrapper: it builds a :class:`WordLoader` from the
    given arguments and returns the result of its ``file_load()`` call.

    Args:
        file_obj: A file path (str/Path), file-like object, bytes, or None.
        filename: Filename with extension. Used when file_obj has no name
            attribute, or as the sole argument for path-based loading.
        file_attr: Name of the attribute on file_obj that holds its
            filename. Defaults to "name".

    Returns:
        str: Extracted text content.
    """
    loader = WordLoader(
        file_obj=file_obj,
        filename=filename,
        file_attr=file_attr,
    )
    return loader.file_load()
def TextSpitter(
    file_obj=None, filename: str | None = None, file_attr: str = "name"
) -> str:
    """
    Return the text content of a file as a single string.

    This is a convenience wrapper: it builds a :class:`WordLoader` from the
    given arguments and returns the result of its ``file_load()`` call.

    Args:
        file_obj: A file path (str/Path), file-like object, bytes, or None.
        filename: Filename with extension. Used when file_obj has no name
            attribute, or as the sole argument for path-based loading.
        file_attr: Name of the attribute on file_obj that holds its
            filename. Defaults to "name".

    Returns:
        str: Extracted text content.
    """
    loader = WordLoader(
        file_obj=file_obj,
        filename=filename,
        file_attr=file_attr,
    )
    return loader.file_load()
Extract text from a file and return it as a string.
Args: file_obj: A file path (str/Path), file-like object, bytes, or None. filename: Filename with extension. Used when file_obj has no name attribute, or as the sole argument for path-based loading. file_attr: Attribute name to read from file_obj for its filename. Defaults to "name".
Returns: str: Extracted text content.
12class WordLoader: 13 """ 14 Dispatch wrapper that routes a file to the correct 15 :class:`~TextSpitter.core.FileExtractor` reader. 16 17 Accepts a file-system path (``str`` or :class:`pathlib.Path`) and an 18 optional *filename* hint. Strings are converted to 19 :class:`~pathlib.Path` objects automatically. 20 21 Use :class:`~TextSpitter.core.FileExtractor` directly if you need to 22 pass a ``BytesIO``, ``SpooledTemporaryFile``, or raw ``bytes``. 23 """ 24 25 FILE_EXT_MATRIX: dict[str, str] = { 26 "pdf": "pdf_file_read", 27 "docx": "docx_file_read", 28 "txt": "text_file_read", 29 "text": "text_file_read", 30 "csv": "csv_file_read", 31 } 32 33 TEXT_MIME_TYPES: frozenset[str] = frozenset( 34 { 35 "plain", 36 "javascript", 37 "x-python", 38 "x-c", 39 "x-java-source", 40 "x-c++", 41 "html", 42 "css", 43 "json", 44 "xml", 45 } 46 ) 47 48 def __init__( 49 self, 50 file_obj: str | Path | None = None, 51 filename: str | None = None, 52 file_attr: str = "name", 53 ): 54 if isinstance(file_obj, str): 55 file_obj = Path(file_obj) 56 self.file = FileExtractor( 57 file_obj=file_obj, filename=filename, file_attr=file_attr 58 ) 59 60 def file_load(self) -> str: 61 """ 62 The primary function for this object. The file is processed and then 63 sent to the appropriate text extraction function based on the 64 appropriate file mimetype. 
65 66 Returns: 67 str 68 """ 69 file_type = self.file.file_ext.lower() 70 71 # Check if it's a specific supported format first 72 if file_type in self.FILE_EXT_MATRIX: 73 text = getattr(self.file, self.FILE_EXT_MATRIX[file_type])() 74 return text 75 # Check if it's a programming language file 76 elif self.file.is_programming_language_file(file_type): 77 logger.info( 78 f"Processing programming language file: {self.file.file_name}" 79 ) 80 text = self.file.code_file_read() 81 return text 82 else: 83 # Fall back to mime type detection 84 mime_type = self.file.get_file_type(self.file.file_name) 85 86 # Check if mime type suggests it's a text-based file 87 if mime_type in self.TEXT_MIME_TYPES: 88 logger.info( 89 f"Processing text-based file by mime type: {mime_type}" 90 ) 91 text = ( 92 self.file.code_file_read() 93 ) # Use code_file_read for better encoding handling 94 return text 95 96 logger.error( 97 f"You are using an incorrect file format for file submissions. " 98 f"Please upload a .docx/.doc/.txt/.pdf file or a supported " 99 f"programming language file (.py, .js, .java, .cpp, etc.). " 100 f"Note the mimetype of your submitted data and submit an " 101 f"error report to github with the following: {mime_type}" 102 ) 103 104 return ""
Dispatch wrapper that routes a file to the correct
`TextSpitter.core.FileExtractor` reader.
Accepts a file-system path (str or pathlib.Path) and an
optional filename hint. Strings are converted to
`pathlib.Path` objects automatically.
Use `TextSpitter.core.FileExtractor` directly if you need to
pass a BytesIO, SpooledTemporaryFile, or raw bytes.
def file_load(self) -> str:
    """
    Extract the text of the wrapped file.

    The reader is chosen in this order:

    1. If the lower-cased extension is a key of ``FILE_EXT_MATRIX``, the
       reader method named there is called.
    2. If ``is_programming_language_file`` accepts the extension,
       ``code_file_read`` is called.
    3. If the value returned by ``get_file_type`` is in
       ``TEXT_MIME_TYPES``, ``code_file_read`` is called.
    4. Otherwise an error is logged and an empty string is returned.

    Returns:
        str: The extracted text, or ``""`` when no reader matched.
    """
    file_type = self.file.file_ext.lower()

    # 1. Explicitly supported formats.
    reader_name = self.FILE_EXT_MATRIX.get(file_type)
    if reader_name is not None:
        return getattr(self.file, reader_name)()

    # 2. Source-code files.
    if self.file.is_programming_language_file(file_type):
        logger.info(
            f"Processing programming language file: {self.file.file_name}"
        )
        return self.file.code_file_read()

    # 3. Fall back to mime type detection.
    mime_type = self.file.get_file_type(self.file.file_name)
    if mime_type in self.TEXT_MIME_TYPES:
        logger.info(
            f"Processing text-based file by mime type: {mime_type}"
        )
        # code_file_read is used here for better encoding handling.
        return self.file.code_file_read()

    # 4. Nothing matched.  The list of accepted extensions is derived
    # from FILE_EXT_MATRIX so the message cannot advertise a format
    # (previously ".doc") that the dispatch table does not handle.
    supported = "/".join(f".{ext}" for ext in self.FILE_EXT_MATRIX)
    logger.error(
        f"You are using an incorrect file format for file submissions. "
        f"Please upload a {supported} file or a supported "
        f"programming language file (.py, .js, .java, .cpp, etc.). "
        f"Note the mimetype of your submitted data and submit an "
        f"error report to github with the following: {mime_type}"
    )
    return ""
The primary function for this object. The file is dispatched to the appropriate text-extraction function based on its file extension, falling back to its MIME type when the extension is not recognised.
Returns: str