From 979cdc6257b31a1f71c95634e96da1a96895f262 Mon Sep 17 00:00:00 2001 From: Hieu Lam Date: Wed, 15 Jan 2025 12:37:16 +0700 Subject: [PATCH 1/3] feat: support images in table and auto detect code languages (optional) --- src/markitdown/_markitdown.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index e68b099..cf8149d 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -76,6 +76,17 @@ except ModuleNotFoundError: pass +try: + from guesslang import Guess +except ImportError: + warn("The 'guesslang' package is not installed. Please install it via 'pip install guesslang'.") + class Guess: + def language_name(self, code: str) -> str: + return "" + +guess = Guess() + + class _CustomMarkdownify(markdownify.MarkdownConverter): """ A custom version of markdownify's MarkdownConverter. Changes include: @@ -88,6 +99,19 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) + + # Keep inline images in table elements + options["keep_inline_images_in"] = options.get("keep_inline_images_in", ["td", "tr", "div", "p", "span"]) + + # Add a custom code language callback to guess the language of code snippets + def code_language_callback(el): + extracted_code_snippet = el.get_text() + if not extracted_code_snippet: + return "" + language = guess.language_name(extracted_code_snippet) + return language.lower() if language else "" + options["code_language_callback"] = options.get("code_language_callback", code_language_callback) + # Explicitly cast options to the expected type if necessary super().__init__(**options) From a75f1a68fb95ca232568f1cd14f73b0f91357d6d Mon Sep 17 00:00:00 2001 From: Hieu Lam Date: Wed, 15 Jan 2025 12:50:40 +0700 Subject: [PATCH 2/3] chore: added short description in README.md about the feature --- README.md | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/README.md b/README.md index 76a4d3f..252d746 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,12 @@ result = md.convert("example.jpg") print(result.text_content) ``` +### Extensions + +#### Automatic Code Language Detection + +Install `guesslang` with the command `pip install guesslang` to enable automatic code language recognition, especially useful for converting documents from web pages. + ### Docker ```sh From 95da5fd2ae5102a9dcf06ccbc6dc314abdedeb4a Mon Sep 17 00:00:00 2001 From: Hieu Lam Date: Mon, 10 Feb 2025 19:00:14 +0700 Subject: [PATCH 3/3] chore: remove code language detection so it can be moved to a separate PR --- README.md | 6 ------ src/markitdown/_markitdown.py | 28 +++++----------------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 252d746..76a4d3f 100644 --- a/README.md +++ b/README.md @@ -81,12 +81,6 @@ result = md.convert("example.jpg") print(result.text_content) ``` -### Extensions - -#### Automatic Code Language Detection - -Install `guesslang` with the command `pip install guesslang` to enable automatic code language recognition, especially useful for converting documents from web pages. - ### Docker ```sh diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index cf8149d..8cd4b49 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -76,17 +76,6 @@ except ModuleNotFoundError: pass -try: - from guesslang import Guess -except ImportError: - warn("The 'guesslang' package is not installed. Please install it via 'pip install guesslang'.") - class Guess: - def language_name(self, code: str) -> str: - return "" - -guess = Guess() - - class _CustomMarkdownify(markdownify.MarkdownConverter): """ A custom version of markdownify's MarkdownConverter.
Changes include: @@ -99,19 +88,12 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) - - # Keep inline images in table elements - options["keep_inline_images_in"] = options.get("keep_inline_images_in", ["td", "tr", "div", "p", "span"]) - # Add a custom code language callback to guess the language of code snippets - def code_language_callback(el): - extracted_code_snippet = el.get_text() - if not extracted_code_snippet: - return "" - language = guess.language_name(extracted_code_snippet) - return language.lower() if language else "" - options["code_language_callback"] = options.get("code_language_callback", code_language_callback) - + # Keep inline images in table elements + options["keep_inline_images_in"] = options.get( + "keep_inline_images_in", ["td", "tr", "div", "p", "span"] + ) + # Explicitly cast options to the expected type if necessary super().__init__(**options)