Add a DocConverter class for DOC files, as requested in issue ``https://github.com/microsoft/markitdown/issues/23``
This commit is contained in:
parent
81e3f24acd
commit
81df7599c7
1 changed files with 77 additions and 47 deletions
|
|
@ -88,10 +88,10 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
||||||
# For the replacement see #29: text nodes underscores are escaped
|
# For the replacement see #29: text nodes underscores are escaped
|
||||||
if (
|
if (
|
||||||
self.options["autolinks"]
|
self.options["autolinks"]
|
||||||
and text.replace(r"\_", "_") == href
|
and text.replace(r"\_", "_") == href
|
||||||
and not title
|
and not title
|
||||||
and not self.options["default_title"]
|
and not self.options["default_title"]
|
||||||
):
|
):
|
||||||
# Shortcut syntax
|
# Shortcut syntax
|
||||||
return "<%s>" % href
|
return "<%s>" % href
|
||||||
|
|
@ -112,8 +112,8 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
title = el.attrs.get("title", None) or ""
|
title = el.attrs.get("title", None) or ""
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
if (
|
if (
|
||||||
convert_as_inline
|
convert_as_inline
|
||||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
and el.parent.name not in self.options["keep_inline_images_in"]
|
||||||
):
|
):
|
||||||
return alt
|
return alt
|
||||||
|
|
||||||
|
|
@ -139,7 +139,7 @@ class DocumentConverter:
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
"""Abstract superclass of all DocumentConverters."""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
@ -148,7 +148,7 @@ class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Guess the content type from any file extension that might be around
|
# Guess the content type from any file extension that might be around
|
||||||
content_type, _ = mimetypes.guess_type(
|
content_type, _ = mimetypes.guess_type(
|
||||||
|
|
@ -174,7 +174,7 @@ class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not html
|
# Bail if not html
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -217,7 +217,7 @@ class WikipediaConverter(DocumentConverter):
|
||||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not Wikipedia
|
# Bail if not Wikipedia
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -266,7 +266,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not YouTube
|
# Bail if not YouTube
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -299,7 +299,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
obj_start = lines[0].find("{")
|
obj_start = lines[0].find("{")
|
||||||
obj_end = lines[0].rfind("}")
|
obj_end = lines[0].rfind("}")
|
||||||
if obj_start >= 0 and obj_end >= 0:
|
if obj_start >= 0 and obj_end >= 0:
|
||||||
data = json.loads(lines[0][obj_start : obj_end + 1])
|
data = json.loads(lines[0][obj_start: obj_end + 1])
|
||||||
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
||||||
if attrdesc:
|
if attrdesc:
|
||||||
metadata["description"] = str(attrdesc["content"])
|
metadata["description"] = str(attrdesc["content"])
|
||||||
|
|
@ -364,10 +364,10 @@ class YouTubeConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get(
|
def _get(
|
||||||
self,
|
self,
|
||||||
metadata: Dict[str, str],
|
metadata: Dict[str, str],
|
||||||
keys: List[str],
|
keys: List[str],
|
||||||
default: Union[str, None] = None,
|
default: Union[str, None] = None,
|
||||||
) -> Union[str, None]:
|
) -> Union[str, None]:
|
||||||
for k in keys:
|
for k in keys:
|
||||||
if k in metadata:
|
if k in metadata:
|
||||||
|
|
@ -435,7 +435,7 @@ class BingSerpConverter(DocumentConverter):
|
||||||
# but appears to be base64 encoded, with some prefix
|
# but appears to be base64 encoded, with some prefix
|
||||||
if "u" in qs:
|
if "u" in qs:
|
||||||
u = (
|
u = (
|
||||||
qs["u"][0][2:].strip() + "=="
|
qs["u"][0][2:].strip() + "=="
|
||||||
) # Python 3 doesn't care about extra padding
|
) # Python 3 doesn't care about extra padding
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -452,8 +452,8 @@ class BingSerpConverter(DocumentConverter):
|
||||||
results.append("\n".join([line for line in lines if len(line) > 0]))
|
results.append("\n".join([line for line in lines if len(line) > 0]))
|
||||||
|
|
||||||
webpage_text = (
|
webpage_text = (
|
||||||
f"## A Bing search for '{query}' found the following results:\n\n"
|
f"## A Bing search for '{query}' found the following results:\n\n"
|
||||||
+ "\n\n".join(results)
|
+ "\n\n".join(results)
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
|
@ -499,6 +499,35 @@ class DocxConverter(HtmlConverter):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
class DocConverter(HtmlConverter):
    """
    Converts DOC files to Markdown.

    The file is first turned into HTML with mammoth, and that HTML is then
    handed to ``self._convert`` (presumably inherited from HtmlConverter)
    to produce the final result.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Converts a DOC file to Markdown.

        Args:
            local_path (str): The path to the DOC file.
            **kwargs: Additional arguments that may include 'file_extension'.

        Returns:
            Union[None, DocumentConverterResult]: The conversion result, or
            None if 'file_extension' is missing, None, or not ".doc"
            (compared case-insensitively).
        """
        # 'file_extension' can be absent or explicitly None; treat both as
        # "no extension" instead of failing on None.lower().
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".doc":
            return None

        # NOTE(review): mammoth documents support for .docx input; confirm it
        # can actually read legacy binary .doc files before relying on this.
        with open(local_path, "rb") as doc_file:
            html_content = mammoth.convert_to_html(doc_file).value

        # The file handle is no longer needed once the HTML string exists.
        return self._convert(html_content)
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
"""
|
"""
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
|
|
@ -557,11 +586,11 @@ class PptxConverter(HtmlConverter):
|
||||||
# A placeholder name
|
# A placeholder name
|
||||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
md_content += (
|
md_content += (
|
||||||
"\n\n"
|
+ ")\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
|
|
@ -579,7 +608,7 @@ class PptxConverter(HtmlConverter):
|
||||||
first_row = False
|
first_row = False
|
||||||
html_table += "</table></body></html>"
|
html_table += "</table></body></html>"
|
||||||
md_content += (
|
md_content += (
|
||||||
"\n" + self._convert(html_table).text_content.strip() + "\n"
|
"\n" + self._convert(html_table).text_content.strip() + "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Text areas
|
# Text areas
|
||||||
|
|
@ -788,15 +817,15 @@ class ImageConverter(MediaConverter):
|
||||||
mlm_model = kwargs.get("mlm_model")
|
mlm_model = kwargs.get("mlm_model")
|
||||||
if mlm_client is not None and mlm_model is not None:
|
if mlm_client is not None and mlm_model is not None:
|
||||||
md_content += (
|
md_content += (
|
||||||
"\n# Description:\n"
|
"\n# Description:\n"
|
||||||
+ self._get_mlm_description(
|
+ self._get_mlm_description(
|
||||||
local_path,
|
local_path,
|
||||||
extension,
|
extension,
|
||||||
mlm_client,
|
mlm_client,
|
||||||
mlm_model,
|
mlm_model,
|
||||||
prompt=kwargs.get("mlm_prompt"),
|
prompt=kwargs.get("mlm_prompt"),
|
||||||
).strip()
|
).strip()
|
||||||
+ "\n"
|
+ "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
|
@ -850,10 +879,10 @@ class MarkItDown:
|
||||||
This reader will convert common file-types or webpages to Markdown."""
|
This reader will convert common file-types or webpages to Markdown."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
requests_session: Optional[requests.Session] = None,
|
requests_session: Optional[requests.Session] = None,
|
||||||
mlm_client: Optional[Any] = None,
|
mlm_client: Optional[Any] = None,
|
||||||
mlm_model: Optional[Any] = None,
|
mlm_model: Optional[Any] = None,
|
||||||
):
|
):
|
||||||
if requests_session is None:
|
if requests_session is None:
|
||||||
self._requests_session = requests.Session()
|
self._requests_session = requests.Session()
|
||||||
|
|
@ -874,6 +903,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(YouTubeConverter())
|
self.register_page_converter(YouTubeConverter())
|
||||||
self.register_page_converter(BingSerpConverter())
|
self.register_page_converter(BingSerpConverter())
|
||||||
self.register_page_converter(DocxConverter())
|
self.register_page_converter(DocxConverter())
|
||||||
|
self.register_page_converter(DocConverter())
|
||||||
self.register_page_converter(XlsxConverter())
|
self.register_page_converter(XlsxConverter())
|
||||||
self.register_page_converter(PptxConverter())
|
self.register_page_converter(PptxConverter())
|
||||||
self.register_page_converter(WavConverter())
|
self.register_page_converter(WavConverter())
|
||||||
|
|
@ -882,7 +912,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -893,9 +923,9 @@ class MarkItDown:
|
||||||
# Local path or url
|
# Local path or url
|
||||||
if isinstance(source, str):
|
if isinstance(source, str):
|
||||||
if (
|
if (
|
||||||
source.startswith("http://")
|
source.startswith("http://")
|
||||||
or source.startswith("https://")
|
or source.startswith("https://")
|
||||||
or source.startswith("file://")
|
or source.startswith("file://")
|
||||||
):
|
):
|
||||||
return self.convert_url(source, **kwargs)
|
return self.convert_url(source, **kwargs)
|
||||||
else:
|
else:
|
||||||
|
|
@ -905,7 +935,7 @@ class MarkItDown:
|
||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: str, **kwargs: Any
|
self, path: str, **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
|
|
@ -923,7 +953,7 @@ class MarkItDown:
|
||||||
|
|
||||||
# TODO what should stream's type be?
|
# TODO what should stream's type be?
|
||||||
def convert_stream(
|
def convert_stream(
|
||||||
self, stream: Any, **kwargs: Any
|
self, stream: Any, **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
|
|
@ -959,7 +989,7 @@ class MarkItDown:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def convert_url(
|
def convert_url(
|
||||||
self, url: str, **kwargs: Any
|
self, url: str, **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: fix kwargs type
|
) -> DocumentConverterResult: # TODO: fix kwargs type
|
||||||
# Send a HTTP request to the URL
|
# Send a HTTP request to the URL
|
||||||
response = self._requests_session.get(url, stream=True)
|
response = self._requests_session.get(url, stream=True)
|
||||||
|
|
@ -967,7 +997,7 @@ class MarkItDown:
|
||||||
return self.convert_response(response, **kwargs)
|
return self.convert_response(response, **kwargs)
|
||||||
|
|
||||||
def convert_response(
|
def convert_response(
|
||||||
self, response: requests.Response, **kwargs: Any
|
self, response: requests.Response, **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO fix kwargs type
|
) -> DocumentConverterResult: # TODO fix kwargs type
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
|
|
@ -1015,7 +1045,7 @@ class MarkItDown:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
error_trace = ""
|
error_trace = ""
|
||||||
for ext in extensions + [None]: # Try last with no extension
|
for ext in extensions + [None]: # Try last with no extension
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue