Updating converters.

This commit is contained in:
Adam Fourney 2025-03-04 13:57:49 -08:00
parent df372fa460
commit 4d09a4c6c6
8 changed files with 366 additions and 207 deletions

View file

@ -80,23 +80,46 @@ class DocumentConverter:
"""
self._priority = priority
def convert_stream(
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> Union[None, DocumentConverterResult]:
) -> bool:
"""
Convert a document to Markdown text, or return None if the converter
cannot handle the document (causing the next converter to be tried).
Return a quick determination on if the converter should attempt converting the document.
This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
make a determination (e.g., special converters for Wikipedia, YouTube etc).
Finally, it is conceivable that the `stream_info.filename` might be used in cases
where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc).
The determination of whether a converter can handle a document is primarily based on
the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
a secondary check if the MIME type is not sufficiently specific
(e.g., application/octet-stream). In the case of data retrieved via HTTP, the
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
of the file content in cases of text/*.
NOTE: The method signature is designed to match that of the convert() method. This provides some
assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
returning. This is because the convert() method may be called immediately after accepts().
Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
- kwargs: Additional keyword arguments for the converter.
Returns:
- bool: True if the converter can handle the document, False otherwise.
"""
raise NotImplementedError(
f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
"""
Convert a document to Markdown text.
Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
@ -105,68 +128,11 @@ class DocumentConverter:
Returns:
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
or
- None: If the converter cannot handle the document.
Raises:
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
- MissingDependencyException: If the converter requires a dependency that is not installed.
"""
# Default implementation ensures backward compatibility with the legacy convert() method, and
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
# in the future.
result = None
used_legacy = False
if stream_info.local_path is not None and os.path.exists(
stream_info.local_path
):
# If the stream is backed by a local file, pass it to the legacy convert() method
try:
result = self.convert(stream_info.local_path, **kwargs)
used_legacy = True
except (
NotImplementedError
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
else:
# Otherwise, we need to read the stream into a temporary file. There is potential for
# thrashing here if there are many converters or conversion attempts
cur_pos = file_stream.tell()
temp_fd, temp_path = tempfile.mkstemp()
try:
with os.fdopen(temp_fd, "wb") as temp_file:
temp_file.write(file_stream.read())
try:
result = self.convert(temp_path, **kwargs)
used_legacy = True
except NotImplementedError:
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
finally:
os.remove(temp_path)
file_stream.seek(0)
if used_legacy:
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
if message not in _WARNED:
warn(message, DeprecationWarning)
_WARNED.append(message)
return result
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
"""
Legacy, and deprecated method to convert a document to Markdown text.
This method reads from the file at `local_path` and returns the converted Markdown text.
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
"""
raise NotImplementedError("Subclasses must implement this method")
@property

View file

@ -414,8 +414,16 @@ class MarkItDown:
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
# Remember the initial stream position so that we can return to it
cur_pos = file_stream.tell()
for stream_info in stream_info_guesses + [StreamInfo()]:
for converter in sorted_converters:
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
_kwargs = copy.deepcopy(kwargs)
# Copy any additional global options
@ -442,17 +450,29 @@ class MarkItDown:
if stream_info.url is not None:
_kwargs["url"] = stream_info.url
# Attempt the conversion
cur_pos = file_stream.tell()
# Check if the converter will accept the file, and if so, try to convert it
_accepts = False
try:
res = converter.convert_stream(file_stream, stream_info, **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
finally:
_accepts = converter.accepts(file_stream, stream_info, **_kwargs)
except NotImplementedError:
pass
# accepts() should not have changed the file stream position
assert (
cur_pos == file_stream.tell()
), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
# Attempt the conversion
if _accepts:
# try:
res = converter.convert(file_stream, stream_info, **_kwargs)
# except Exception:
# failed_attempts.append(
# FailedConversionAttempt(
# converter=converter, exc_info=sys.exc_info()
# )
# )
# finally:
file_stream.seek(cur_pos)
if res is not None:

View file

@ -1,14 +1,24 @@
# type: ignore
import base64
import io
import re
from typing import Union
import base64
from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
".html",
".htm",
]
class BingSerpConverter(DocumentConverter):
"""
@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
url = kwargs.get("url", "")
if not re.search(r"^https://www\.bing\.com/search\?q=", url):
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Report whether the stream looks like HTML content *from* Bing search.

    The decision uses only `stream_info`; `file_stream` is never read, so
    its position is left untouched.

    Returns:
    - bool: True when `stream_info.url` is a Bing search URL and either the
      extension or the MIME type indicates HTML; False otherwise.
    """
    # Guard: anything that is not a Bing SERP URL is rejected outright
    page_url = (stream_info.url or "").lower()
    if re.search(r"^https://www\.bing\.com/search\?q=", page_url) is None:
        return False

    # A recognized HTML extension is sufficient on its own
    if (stream_info.extension or "").lower() in ACCEPTED_FILE_EXTENSIONS:
        return True

    # Otherwise fall back to the MIME type; no match means "not HTML content"
    mime = (stream_info.mimetype or "").lower()
    return any(mime.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the query parameters
parsed_params = parse_qs(urlparse(url).query)
parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0]
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):

View file

@ -1,9 +1,10 @@
import sys
from typing import Union
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@ -16,6 +17,13 @@ except ImportError:
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
ACCEPTED_FILE_EXTENSIONS = [".docx"]
class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Report whether the stream metadata identifies a DOCX document.

    Only `stream_info` is consulted; `file_stream` is not read.

    Returns:
    - bool: True when the extension is listed in ACCEPTED_FILE_EXTENSIONS or
      the MIME type starts with one of ACCEPTED_MIME_TYPE_PREFIXES.
    """
    # The extension is checked first; a match short-circuits the MIME lookup
    if (stream_info.extension or "").lower() in ACCEPTED_FILE_EXTENSIONS:
        return True

    mime = (stream_info.mimetype or "").lower()
    return any(mime.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
_dependency_exc_info[2]
) # Restore the original traceback
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
return result
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value
)

View file

@ -1,4 +1,5 @@
from typing import Any, Union, BinaryIO
import io
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert_stream(
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> Union[None, DocumentConverterResult]:
# Bail if not html
if not self._is_html(stream_info):
return None
# Read the stream into a string
html_content = str(
file_stream.read(),
encoding=stream_info.charset if stream_info.charset else "utf-8",
)
return self._convert(html_content)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not html
extension = kwargs.get("file_extension", "")
if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
return None
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
result = self._convert(fh.read())
return result
def _is_html(self, stream_info: StreamInfo) -> bool:
"""Helper function that checks if the stream is html."""
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):
return False
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string."""
# Parse the string
soup = BeautifulSoup(html_content, "html.parser")
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):
@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
markdown=webpage_text,
title=None if soup.title is None else soup.title.string,
)
def convert_string(
    self, html_content: str, *, url: Optional[str] = None, **kwargs
) -> DocumentConverterResult:
    """
    Non-standard convenience method that converts an HTML string to Markdown.

    Many converters produce HTML as an intermediate representation; this
    wraps that string in an in-memory byte stream and hands it to convert().

    Parameters:
    - html_content: The HTML text to convert. It is encoded as UTF-8.
    - url: Optional URL recorded on the StreamInfo passed to convert().
    - kwargs: Forwarded unchanged to convert().

    Returns:
    - DocumentConverterResult: Whatever convert() returns for the stream.
    """
    # Present the string as a binary stream, as convert() expects
    html_stream = io.BytesIO(html_content.encode("utf-8"))

    # Describe the stream explicitly as UTF-8 encoded HTML
    html_info = StreamInfo(
        mimetype="text/html",
        extension=".html",
        charset="utf-8",
        url=url,
    )

    return self.convert(file_stream=html_stream, stream_info=html_info, **kwargs)

View file

@ -1,12 +1,13 @@
import sys
import base64
import re
import html
import sys
from typing import Union
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@ -19,7 +20,14 @@ except ImportError:
_dependency_exc_info = sys.exc_info()
class PptxConverter(HtmlConverter):
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.presentationml",
]
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
class PptxConverter(DocumentConverter):
"""
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""
@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx":
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Report whether the stream metadata identifies a PPTX presentation.

    Only `stream_info` is consulted; `file_stream` is not read.

    Returns:
    - bool: True when the extension is listed in ACCEPTED_FILE_EXTENSIONS or
      the MIME type starts with one of ACCEPTED_MIME_TYPE_PREFIXES.
    """
    file_ext = (stream_info.extension or "").lower()
    mime = (stream_info.mimetype or "").lower()

    # Either signal is enough: a known extension, or a matching MIME prefix
    return file_ext in ACCEPTED_FILE_EXTENSIONS or any(
        mime.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
    )
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
_dependency_exc_info[2]
) # Restore the original traceback
presentation = pptx.Presentation(local_path)
# Perform the conversion
presentation = pptx.Presentation(file_stream)
md_content = ""
slide_num = 0
for slide in presentation.slides:
@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):
# Tables
if self._is_table(shape):
html_table = "<html><body><table>"
first_row = True
for row in shape.table.rows:
html_table += "<tr>"
for cell in row.cells:
if first_row:
html_table += "<th>" + html.escape(cell.text) + "</th>"
else:
html_table += "<td>" + html.escape(cell.text) + "</td>"
html_table += "</tr>"
first_row = False
html_table += "</table></body></html>"
md_content += (
"\n" + self._convert(html_table).text_content.strip() + "\n"
)
md_content += self._convert_table_to_markdown(shape.table)
# Charts
if shape.has_chart:
@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
return True
return False
def _convert_table_to_markdown(self, table):
    """
    Render a table as Markdown by way of an intermediate HTML table.

    The first row is emitted as header cells (<th>), every later row as
    data cells (<td>). Cell text is HTML-escaped before insertion. The HTML
    is then converted with `self._html_converter.convert_string`, and the
    stripped Markdown is returned with a single trailing newline.
    """
    fragments = ["<html><body><table>"]
    for row_index, row in enumerate(table.rows):
        # Only the very first row is treated as the header row
        if row_index == 0:
            cell_open, cell_close = "<th>", "</th>"
        else:
            cell_open, cell_close = "<td>", "</td>"

        fragments.append("<tr>")
        for cell in row.cells:
            fragments.append(cell_open + html.escape(cell.text) + cell_close)
        fragments.append("</tr>")
    fragments.append("</table></body></html>")

    table_html = "".join(fragments)
    return self._html_converter.convert_string(table_html).markdown.strip() + "\n"
def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart"
if chart.has_title:

View file

@ -1,11 +1,22 @@
import io
import re
from typing import Any, Union
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
".html",
".htm",
]
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not Wikipedia
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
url = kwargs.get("url", "")
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
return None
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
"""
Make sure we're dealing with HTML content *from* Wikipedia.
"""
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
url = (stream_info.url or "").lower()
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
# Not a Wikipedia URL
return False
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
# Not HTML content
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View file

@ -1,10 +1,9 @@
import sys
from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
@ -22,8 +21,19 @@ try:
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
class XlsxConverter(HtmlConverter):
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
"application/vnd.ms-excel",
"application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
class XlsxConverter(DocumentConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx":
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Report whether the stream metadata identifies an XLSX workbook.

    Only `stream_info` is consulted; `file_stream` is not read.

    Returns:
    - bool: True when the extension is listed in
      ACCEPTED_XLSX_FILE_EXTENSIONS or the MIME type starts with one of
      ACCEPTED_XLSX_MIME_TYPE_PREFIXES.
    """
    # The extension is checked first; a match short-circuits the MIME lookup
    if (stream_info.extension or "").lower() in ACCEPTED_XLSX_FILE_EXTENSIONS:
        return True

    mime = (stream_info.mimetype or "").lower()
    return any(
        mime.startswith(prefix) for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES
    )
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip())
class XlsConverter(HtmlConverter):
class XlsConverter(DocumentConverter):
"""
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls":
return None
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """
    Set up the converter with the given priority.

    An HtmlConverter instance is stored on `self._html_converter` so that
    HTML produced during conversion can be turned into Markdown.
    """
    super().__init__(priority=priority)
    self._html_converter = HtmlConverter()
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Report whether the stream metadata identifies a legacy XLS workbook.

    Only `stream_info` is consulted; `file_stream` is not read.

    Returns:
    - bool: True when the extension is listed in
      ACCEPTED_XLS_FILE_EXTENSIONS or the MIME type starts with one of
      ACCEPTED_XLS_MIME_TYPE_PREFIXES.
    """
    file_ext = (stream_info.extension or "").lower()
    mime = (stream_info.mimetype or "").lower()

    # Either signal is enough: a known extension, or a matching MIME prefix
    return file_ext in ACCEPTED_XLS_FILE_EXTENSIONS or any(
        mime.startswith(prefix) for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES
    )
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Load the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
_xls_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip())