Progress on HTML converter.

This commit is contained in:
Adam Fourney 2025-03-04 08:33:50 -08:00
parent 4129f30c23
commit df372fa460

View file

@ -1,9 +1,20 @@
from typing import Any, Union from typing import Any, Union, BinaryIO
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
".html",
".htm",
]
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
@ -13,12 +24,29 @@ class HtmlConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert_stream(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> Union[None, DocumentConverterResult]:
# Bail if not html
if not self._is_html(stream_info):
return None
# Read the stream into a string
html_content = str(
file_stream.read(),
encoding=stream_info.charset if stream_info.charset else "utf-8",
)
return self._convert(html_content)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not html # Bail if not html
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]: if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
return None return None
result = None result = None
@ -27,6 +55,20 @@ class HtmlConverter(DocumentConverter):
return result return result
def _is_html(self, stream_info: StreamInfo) -> bool:
"""Helper function that checks if the stream is html."""
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string.""" """Helper function that converts an HTML string."""