Added a DocConverter class for converting DOC files to Markdown, as requested in issue ``https://github.com/microsoft/markitdown/issues/23``

This commit is contained in:
Aviral Bhardwaj 2024-12-15 19:34:31 +05:30
parent 81e3f24acd
commit 81df7599c7

View file

@ -499,6 +499,35 @@ class DocxConverter(HtmlConverter):
return result return result
class DocConverter(HtmlConverter):
    """
    Converts DOC files to Markdown.

    The file is read with mammoth to obtain HTML, and that HTML is passed to
    ``self._convert``, whose result is returned to the caller.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Converts a DOC file to Markdown.

        Args:
            local_path (str): The path to the DOC file.
            **kwargs: Additional arguments that may include 'file_extension'.

        Returns:
            Union[None, DocumentConverterResult]: The conversion result, or None
            if 'file_extension' is missing, None, empty, or not ".doc"
            (compared case-insensitively).
        """
        # `or ""` also covers an explicit file_extension=None, which would
        # otherwise raise AttributeError on .lower() instead of bailing out.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".doc":
            return None

        # NOTE(review): mammoth's documentation describes .docx support only; a
        # legacy binary .doc file may well be rejected by this call -- confirm
        # against real .doc samples before relying on this converter.
        with open(local_path, "rb") as doc_file:
            html_content = mammoth.convert_to_html(doc_file).value

        return self._convert(html_content)
class XlsxConverter(HtmlConverter): class XlsxConverter(HtmlConverter):
""" """
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
@ -874,6 +903,7 @@ class MarkItDown:
self.register_page_converter(YouTubeConverter()) self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter()) self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter()) self.register_page_converter(DocxConverter())
self.register_page_converter(DocConverter())
self.register_page_converter(XlsxConverter()) self.register_page_converter(XlsxConverter())
self.register_page_converter(PptxConverter()) self.register_page_converter(PptxConverter())
self.register_page_converter(WavConverter()) self.register_page_converter(WavConverter())