From 65b3f4a152d78cee1b1fc93e1fa31b34a110a871 Mon Sep 17 00:00:00 2001
From: Hieu Lam
Date: Mon, 10 Mar 2025 10:28:24 +0700
Subject: [PATCH] chore: using `magika` instead of `guesslang`

---
Review notes (this section sits between `---` and the diffstat and is not
part of the commit message):

* What the patch does: creates one module-level `Magika()` instance and gives
  `_CustomMarkdownify.__init__` a default `code_language_callback`. The
  callback reads the element text with `el.get_text()`, returns "" for empty
  text, otherwise passes the encoded text to `magika.identify_bytes()` and
  returns `result.prediction.output.label` when `result.status == "ok"` and
  the output group is "text" or "code"; in every other case it returns "".
* A `code_language_callback` already present in `options` takes precedence,
  because the default is installed through `options.get(...)`.
* The subject mentions `guesslang`, but this diff contains insertions only;
  no `guesslang` code is touched here.
* NOTE(review): `Magika()` is constructed when `_markdownify` is imported;
  presumably this loads the model eagerly - confirm the import-time cost is
  acceptable.
* NOTE(review): exceptions raised by `identify_bytes()` are not caught inside
  the callback - confirm whether a failed guess should abort the conversion.

 .../src/markitdown/converters/_markdownify.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index ae99c0b..b641fbb 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -3,7 +3,9 @@ import markdownify
 
 from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse
+from magika import Magika
 
+magika = Magika()
 
 class _CustomMarkdownify(markdownify.MarkdownConverter):
     """
@@ -17,6 +19,22 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
+
+        # Add a custom code language callback to guess the language of code snippets
+        def code_language_callback(el):
+            extracted_code_snippet = el.get_text()
+            if not extracted_code_snippet:
+                return ""
+            result = magika.identify_bytes(extracted_code_snippet.encode())
+            if result.status == "ok" and result.prediction.output.group in ["text", "code"]:
+                language = result.prediction.output.label
+                return language
+            return ""
+
+        options["code_language_callback"] = options.get(
+            "code_language_callback", code_language_callback
+        )
+
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 