From 6feccc82eb4be5a1c4f95601054e6daa33d2b2b8 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 19 Mar 2025 07:50:23 -0700 Subject: [PATCH] Consider anything with a charset as plain text-convertable. --- .../converters/_plain_text_converter.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 4a21d3a..2e10405 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -17,12 +17,16 @@ except ImportError: ACCEPTED_MIME_TYPE_PREFIXES = [ "text/", "application/json", + "application/markdown", ] -# Mimetypes to ignore (commonly confused extensions) -IGNORE_MIME_TYPE_PREFIXES = [ - "text/vnd.in3d.spot", # .spo wich is confused with xls, doc, etc. - "text/vnd.graphviz", # .dot which is confused with xls, doc, etc. +ACCEPTED_FILE_EXTENSIONS = [ + ".txt", + ".text", + ".md", + ".markdown", + ".json", + ".jsonl", ] @@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter): mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() - for prefix in IGNORE_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return False + # If we have a charset, we can safely assume it's text + # With Magika in the earlier stages, this handles most cases + if stream_info.charset is not None: + return True + + # Otherwise, check the mimetype and extension + if extension in ACCEPTED_FILE_EXTENSIONS: + return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix):