Merge remote-tracking branch 'origin/feat-optional_b64' into feat-optional_b64

This commit is contained in:
Yuzhong Zhang 2025-03-21 00:49:55 +08:00
commit e952ab1189

View file

@@ -17,12 +17,16 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
"text/", "text/",
"application/json", "application/json",
"application/markdown",
] ]
# Mimetypes to ignore (commonly confused extensions) ACCEPTED_FILE_EXTENSIONS = [
IGNORE_MIME_TYPE_PREFIXES = [ ".txt",
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc. ".text",
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc. ".md",
".markdown",
".json",
".jsonl",
] ]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
for prefix in IGNORE_MIME_TYPE_PREFIXES: # If we have a charset, we can safely assume it's text
if mimetype.startswith(prefix): # With Magika in the earlier stages, this handles most cases
return False if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES: for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix): if mimetype.startswith(prefix):