chore: use magika instead of guesslang

This commit is contained in:
Hieu Lam 2025-03-10 10:28:24 +07:00
parent 58a687c08c
commit 65b3f4a152

View file

@ -3,7 +3,9 @@ import markdownify
from typing import Any, Optional from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse from urllib.parse import quote, unquote, urlparse, urlunparse
from magika import Magika
magika = Magika()
class _CustomMarkdownify(markdownify.MarkdownConverter): class _CustomMarkdownify(markdownify.MarkdownConverter):
""" """
@ -17,6 +19,22 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
    """Set converter defaults, then defer to ``MarkdownConverter``.

    Two options are filled in only when the caller did not supply them:

    * ``heading_style`` falls back to ``markdownify.ATX``.
    * ``code_language_callback`` falls back to a helper that asks the
      module-level ``magika`` instance to identify the snippet.

    Args:
        **options: Keyword options forwarded to the parent constructor.
    """

    def _guess_code_language(el):
        """Return the label Magika reports for ``el``'s text, or ``""``.

        An empty string is returned when the element has no text, when the
        identification status is not ``"ok"``, or when the detected content
        group is neither ``"text"`` nor ``"code"``.
        """
        snippet = el.get_text()
        if not snippet:
            return ""

        detection = magika.identify_bytes(snippet.encode())
        # Only read the prediction once the status has been confirmed.
        if detection.status != "ok":
            return ""

        detected = detection.prediction.output
        if detected.group not in ("text", "code"):
            return ""
        return detected.label

    # setdefault keeps any value the caller passed explicitly.
    options.setdefault("heading_style", markdownify.ATX)
    options.setdefault("code_language_callback", _guess_code_language)

    # Explicitly cast options to the expected type if necessary
    super().__init__(**options)