From 979cdc6257b31a1f71c95634e96da1a96895f262 Mon Sep 17 00:00:00 2001
From: Hieu Lam
Date: Wed, 15 Jan 2025 12:37:16 +0700
Subject: [PATCH] feat: support images in table and auto detect code languages (optional)

Keep inline images inside td/tr/div/p/span elements, and pass markdownify a
code_language_callback that asks the optional guesslang package for the
language of each code snippet. When guesslang is not installed a stand-in
reports no language, so importing the module stays silent.
---
Review note: the post-image blob id on the "index" line below predates the
review changes in this revision and should be regenerated with git.

 src/markitdown/_markitdown.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index e68b099..cf8149d 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -76,6 +76,35 @@ except ModuleNotFoundError:
     pass
 
 
+# guesslang is an optional dependency; when it is missing, the stand-in
+# below reports no language for every snippet.
+try:
+    from guesslang import Guess
+except ImportError:
+    class Guess:
+        """No-op stand-in used when guesslang is not installed."""
+
+        def language_name(self, code: str) -> str:
+            return ""
+
+
+guess = Guess()
+
+
+def _guess_code_language(el):
+    """
+    Return a lower-cased language name guessed from the text of `el`.
+
+    Passed to markdownify as `code_language_callback`. Returns "" when the
+    element has no text or when no language name is reported.
+    """
+    snippet = el.get_text()
+    if not snippet:
+        return ""
+    language = guess.language_name(snippet)
+    return language.lower() if language else ""
+
+
 class _CustomMarkdownify(markdownify.MarkdownConverter):
     """
     A custom version of markdownify's MarkdownConverter. Changes include:
@@ -88,6 +117,17 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
+
+        # Keep inline images in table cells and other common containers
+        options["keep_inline_images_in"] = options.get(
+            "keep_inline_images_in", ["td", "tr", "div", "p", "span"]
+        )
+
+        # Guess the language of code snippets unless a callback was supplied
+        options["code_language_callback"] = options.get(
+            "code_language_callback", _guess_code_language
+        )
+
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 