From df372fa460cae44a6923ae12d00ae83e81bea60c Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 4 Mar 2025 08:33:50 -0800 Subject: [PATCH] Progress on HTML converter. --- .../markitdown/converters/_html_converter.py | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 172875e..abd5013 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,9 +1,20 @@ -from typing import Any, Union +from typing import Any, Union, BinaryIO from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class HtmlConverter(DocumentConverter): """Anything with content type text/html""" @@ -13,12 +24,29 @@ class HtmlConverter(DocumentConverter): ): super().__init__(priority=priority) + def convert_stream( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> Union[None, DocumentConverterResult]: + # Bail if not html + if not self._is_html(stream_info): + return None + + # Read the stream into a string + html_content = str( + file_stream.read(), + encoding=stream_info.charset if stream_info.charset else "utf-8", + ) + return self._convert(html_content) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: # Bail if not html extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: + if extension.lower() not in ACCEPTED_FILE_EXTENSIONS: return None result = None @@ -27,6 +55,20 @@ class HtmlConverter(DocumentConverter): return result + def _is_html(self, stream_info: StreamInfo) -> bool: + """Helper function that checks if the stream is html.""" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: """Helper function that converts an HTML string."""