Add a DocConverter class for .doc files, as requested in issue https://github.com/microsoft/markitdown/issues/23
This commit is contained in:
parent
81e3f24acd
commit
81df7599c7
1 changed files with 77 additions and 47 deletions
|
|
@ -299,7 +299,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
obj_start = lines[0].find("{")
|
obj_start = lines[0].find("{")
|
||||||
obj_end = lines[0].rfind("}")
|
obj_end = lines[0].rfind("}")
|
||||||
if obj_start >= 0 and obj_end >= 0:
|
if obj_start >= 0 and obj_end >= 0:
|
||||||
data = json.loads(lines[0][obj_start : obj_end + 1])
|
data = json.loads(lines[0][obj_start: obj_end + 1])
|
||||||
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
||||||
if attrdesc:
|
if attrdesc:
|
||||||
metadata["description"] = str(attrdesc["content"])
|
metadata["description"] = str(attrdesc["content"])
|
||||||
|
|
@ -499,6 +499,35 @@ class DocxConverter(HtmlConverter):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
class DocConverter(HtmlConverter):
    """
    Turns DOC files into Markdown by rendering them to HTML first.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert the DOC file at ``local_path`` to Markdown.

        The file is opened in binary mode, rendered to HTML with
        ``mammoth.convert_to_html``, and the resulting HTML string is handed
        to ``self._convert``.

        NOTE(review): mammoth's documentation describes support for .docx
        input only -- confirm that legacy binary .doc files are actually
        parsed by this call.

        Args:
            local_path (str): Path of the DOC file to read.
            **kwargs: Extra options; only 'file_extension' is consulted here.

        Returns:
            Union[None, DocumentConverterResult]: Whatever ``self._convert``
            produces for the generated HTML, or None when 'file_extension'
            is not ".doc" (compared case-insensitively; a missing key
            counts as no match).
        """
        # Decline anything that is not explicitly tagged as a .doc file.
        if kwargs.get("file_extension", "").lower() != ".doc":
            return None

        with open(local_path, "rb") as doc_file:
            html_content = mammoth.convert_to_html(doc_file).value
            return self._convert(html_content)
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
"""
|
"""
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
|
|
@ -874,6 +903,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(YouTubeConverter())
|
self.register_page_converter(YouTubeConverter())
|
||||||
self.register_page_converter(BingSerpConverter())
|
self.register_page_converter(BingSerpConverter())
|
||||||
self.register_page_converter(DocxConverter())
|
self.register_page_converter(DocxConverter())
|
||||||
|
self.register_page_converter(DocConverter())
|
||||||
self.register_page_converter(XlsxConverter())
|
self.register_page_converter(XlsxConverter())
|
||||||
self.register_page_converter(PptxConverter())
|
self.register_page_converter(PptxConverter())
|
||||||
self.register_page_converter(WavConverter())
|
self.register_page_converter(WavConverter())
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue