From bb34d937112361ce866486cad0c76326550d136e Mon Sep 17 00:00:00 2001
From: Aviral Bhardwaj <152312896+aviralbhardwaj007@users.noreply.github.com>
Date: Sun, 15 Dec 2024 21:15:32 +0530
Subject: [PATCH] Added DOC to Markdown Converter Function - Issue #23

---
 src/markitdown/_markitdown.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96997cf..5085e90 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -498,6 +498,45 @@ class DocxConverter(HtmlConverter):
 
         return result
 
+
+class DocConverter(HtmlConverter):
+    """
+    Converts Word documents carrying a ".doc" extension to Markdown.
+
+    Conversion is delegated to mammoth, which reads zip-based Word packages.
+    Files that are not zip containers (such as legacy binary Word documents)
+    are declined instead of being handed to mammoth.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """
+        Convert the file at local_path to Markdown.
+
+        Args:
+            local_path (str): The path to the DOC file.
+            **kwargs: Additional arguments that may include 'file_extension'.
+        Returns:
+            Union[None, DocumentConverterResult]: The conversion result, or None
+            if the extension is not ".doc" or the file is not a zip container.
+        """
+        # Local import: keeps this change independent of the module's import block.
+        import zipfile
+
+        # 'file_extension' may be missing or None; treat both as "".
+        extension = kwargs.get("file_extension") or ""
+        if extension.lower() != ".doc":
+            return None
+
+        with open(local_path, "rb") as doc_file:
+            # mammoth only reads zip-based packages, so decline anything else
+            # here rather than letting the conversion fail on the file.
+            if not zipfile.is_zipfile(doc_file):
+                return None
+            doc_file.seek(0)  # rewind in case is_zipfile moved the read position
+            html_content = mammoth.convert_to_html(doc_file).value
+
+        return self._convert(html_content)
+
 
 class XlsxConverter(HtmlConverter):
     """
@@ -874,6 +913,7 @@ class MarkItDown:
         self.register_page_converter(YouTubeConverter())
         self.register_page_converter(BingSerpConverter())
         self.register_page_converter(DocxConverter())
+        self.register_page_converter(DocConverter())
         self.register_page_converter(XlsxConverter())
         self.register_page_converter(PptxConverter())
         self.register_page_converter(WavConverter())