Progress on HTML converter.
This commit is contained in:
parent
4129f30c23
commit
df372fa460
1 changed files with 44 additions and 2 deletions
|
|
@ -1,9 +1,20 @@
|
|||
from typing import Any, Union
|
||||
from typing import Any, Union, BinaryIO
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"text/html",
|
||||
"application/xhtml",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [
|
||||
".html",
|
||||
".htm",
|
||||
]
|
||||
|
||||
|
||||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
|
@ -13,12 +24,29 @@ class HtmlConverter(DocumentConverter):
|
|||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert_stream(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not html
|
||||
if not self._is_html(stream_info):
|
||||
return None
|
||||
|
||||
# Read the stream into a string
|
||||
html_content = str(
|
||||
file_stream.read(),
|
||||
encoding=stream_info.charset if stream_info.charset else "utf-8",
|
||||
)
|
||||
return self._convert(html_content)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not html
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() not in [".html", ".htm"]:
|
||||
if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
|
||||
return None
|
||||
|
||||
result = None
|
||||
|
|
@ -27,6 +55,20 @@ class HtmlConverter(DocumentConverter):
|
|||
|
||||
return result
|
||||
|
||||
def _is_html(self, stream_info: StreamInfo) -> bool:
|
||||
"""Helper function that checks if the stream is html."""
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
||||
"""Helper function that converts an HTML string."""
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue