From fe1d57a06f976d06b2c6798b1a71ce6ea477cc6e Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 5 Mar 2025 15:12:13 -0800 Subject: [PATCH] Updated DocumentConverter documentation. --- .../markitdown/src/markitdown/_base_converter.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index 9079843..f4fb3a1 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -86,7 +86,7 @@ class DocumentConverter: """ Return a quick determination on if the converter should attempt converting the document. This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). - In cases where the data is retreived via HTTP, the `steam_info.url` might also be referenced to + In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to make a determination (e.g., special converters for Wikipedia, YouTube etc). Finally, it is conceivable that the `stream_info.filename` might be used to in cases where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc) @@ -94,8 +94,15 @@ class DocumentConverter: NOTE: The method signature is designed to match that of the convert() method. This provides some assurance that, if accepts() returns True, the convert() method will also be able to handle the document. - IMPORTANT: If this method advances the position in file_stream, it must also reset the position before - returning. This is because the convert() method may be called immediately after accepts(). + IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final + determination. Read operations inevitably advance the position in file_stream. In these cases, the position + MUST be reset before returning. 
This is because the convert() method may be called immediately + after accepts(), and will expect the file_stream to be at the original position. + + E.g., + cur_pos = file_stream.tell() # Save the current position + data = file_stream.read(100) # ... peek at the first 100 bytes, etc. + file_stream.seek(cur_pos) # Reset the position to the original position Prameters: - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.