Skip to content

Html Module

HTML file handler for text extraction.

Classes:

| Name | Description |
| --- | --- |
| HTMLHandler | Handler for extracting text from HTML files. |

Classes

HTMLHandler

Bases: FileTypeHandler

Handler for extracting text from HTML files.

Methods:

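| Name | Description |
| --- | --- |
| extract | Read the HTML file and return its text content. |
| extract_async | Run extract in a worker thread and await its result. |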
Source code in textxtract/handlers/html.py
class HTMLHandler(FileTypeHandler):
    """Handler for extracting text from HTML files."""

    def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
        """Return the text content of the HTML file at ``file_path``.

        Args:
            file_path: Path of the HTML file to read.
            config: Optional settings. The ``"encoding"`` key selects the
                encoding used to read the file (defaults to ``"utf-8"``).

        Returns:
            The result of ``BeautifulSoup.get_text()`` for the document,
            parsed with the ``"html.parser"`` backend.

        Raises:
            ExtractionError: If beautifulsoup4 cannot be imported, or if
                reading or parsing the file fails.
        """
        try:
            try:
                from bs4 import BeautifulSoup
            except ImportError as exc:
                raise ExtractionError(
                    "beautifulsoup4 package is not installed. Install with 'pip install text-extractor[html]'"
                ) from exc
            encoding = (config or {}).get("encoding", "utf-8")
            text = file_path.read_text(encoding=encoding)
            soup = BeautifulSoup(text, "html.parser")
            return soup.get_text()
        except ExtractionError:
            # Already a domain error (e.g. the missing-dependency hint above);
            # let it through as-is instead of wrapping it a second time.
            raise
        except Exception as e:
            # Keep the original exception attached as the explicit cause.
            raise ExtractionError(f"HTML extraction failed: {e}") from e

    async def extract_async(
        self, file_path: Path, config: Optional[dict] = None
    ) -> str:
        """Run ``extract`` in a separate thread and return its result.

        Takes the same arguments as ``extract``; the call is handed to
        ``asyncio.to_thread`` and awaited.
        """
        import asyncio

        return await asyncio.to_thread(self.extract, file_path, config)

Functions

extract
extract(file_path, config=None)
Source code in textxtract/handlers/html.py
def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
    """Return the text content of the HTML file at ``file_path``.

    Args:
        file_path: Path of the HTML file to read.
        config: Optional settings. The ``"encoding"`` key selects the
            encoding used to read the file (defaults to ``"utf-8"``).

    Returns:
        The result of ``BeautifulSoup.get_text()`` for the document,
        parsed with the ``"html.parser"`` backend.

    Raises:
        ExtractionError: If beautifulsoup4 cannot be imported, or if
            reading or parsing the file fails.
    """
    try:
        try:
            from bs4 import BeautifulSoup
        except ImportError as exc:
            raise ExtractionError(
                "beautifulsoup4 package is not installed. Install with 'pip install text-extractor[html]'"
            ) from exc
        encoding = (config or {}).get("encoding", "utf-8")
        text = file_path.read_text(encoding=encoding)
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except ExtractionError:
        # Already a domain error (e.g. the missing-dependency hint above);
        # let it through as-is instead of wrapping it a second time.
        raise
    except Exception as e:
        # Keep the original exception attached as the explicit cause.
        raise ExtractionError(f"HTML extraction failed: {e}") from e
extract_async async
extract_async(file_path, config=None)
Source code in textxtract/handlers/html.py
async def extract_async(
    self, file_path: Path, config: Optional[dict] = None
) -> str:
    """Run ``self.extract`` in a separate thread and return its result.

    ``file_path`` and ``config`` are forwarded to ``extract`` unchanged.
    """
    from asyncio import to_thread

    extracted = await to_thread(self.extract, file_path, config)
    return extracted