From 1161c30ba3f7bf6c2168ab387f805058902a121d Mon Sep 17 00:00:00 2001 From: Nishith Jain <167524748+KingNish24@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:46:00 +0530 Subject: [PATCH 1/4] Update _markitdown.py --- .../markitdown/src/markitdown/_markitdown.py | 77 +++++++++++++++---- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b7ac5bc..cc5759a 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -73,6 +73,19 @@ def _load_plugins() -> Union[None | List[Any]]: return _plugins +def isBase64(sb): + """ + checks if the input object is base64 + """ + try: + if isinstance(sb, str): + sb = re.sub(r"^data:.*base64,", "", sb) + sb_bytes = bytes(sb, "ascii") + elif isinstance(sb, bytes): + sb_bytes = sb + return base64.b64encode(base64.b64decode(sb_bytes)) == sb_bytes + except Exception: + return False class MarkItDown: """(In preview) An extremely simple text-based document reader, suitable for LLM use. @@ -175,29 +188,65 @@ class MarkItDown: warn("Plugins converters are already enabled.", RuntimeWarning) def convert( - self, source: Union[str, requests.Response, Path], **kwargs: Any + self, source: Union[str, requests.Response, Path], input_type: Literal["auto", "local_file", "url", "base64", "bytes", "request_response"] = "auto", **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object + - input_type: specifies the input type. If set to "auto", the function will try to automatically determine the type. - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) 
""" - # Local path or url - if isinstance(source, str): - if ( - source.startswith("http://") - or source.startswith("https://") - or source.startswith("file://") - ): - return self.convert_url(source, **kwargs) + if input_type == "auto": + # Check if source is Local path or url + if isinstance(source, str): + if ( + source.startswith("http://") + or source.startswith("https://") + or source.startswith("file://") + ): + input_type = "url" + elif os.path.isfile(source): + input_type = "local_file" + elif isBase64(source): + input_type = "base64" + # Check if source is a Request response + elif isinstance(source, requests.Response): + input_type = "request_response" + # Check if source is a local file path + elif isinstance(source, Path): + input_type = "local_file" + # Check if source is a Base64 encoded string + elif isBase64(source): + input_type = "base64" + # Check if source is a bytes object + elif isinstance(source, bytes): + input_type = "bytes" else: - return self.convert_local(source, **kwargs) - # Request response - elif isinstance(source, requests.Response): - return self.convert_response(source, **kwargs) - elif isinstance(source, Path): + raise ValueError(f"Unable to determine input type: {type(source)}") + + elif input_type == "url": + return self.convert_url(source, **kwargs) + elif input_type == "local_file": return self.convert_local(source, **kwargs) + elif input_type == "bytes" or input_type == "base64": + if input_type == "base64": + source = re.sub(r"^data:.*base64,", "", source) + source = base64.b64decode(source) + try: + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + tmp_file.write(source) + tmp_file.flush() # Ensure data is written to file + return self.convert_local(tmp_file.name, **kwargs) + except Exception as e: + raise e + finally: + if os.path.exists(tmp_file.name): + os.remove(tmp_file.name) + elif input_type == "request_response": + return self.convert_response(source, **kwargs) + else: + raise ValueError(f"Invalid 
input type: {input_type}") def convert_local( self, path: Union[str, Path], **kwargs: Any From 1cf8b26577b6aad64c754694c82fd2d0cc461581 Mon Sep 17 00:00:00 2001 From: Nishith Jain <167524748+KingNish24@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:53:08 +0530 Subject: [PATCH 2/4] a little fix --- packages/markitdown/src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index cc5759a..93f0b3d 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -225,7 +225,7 @@ class MarkItDown: else: raise ValueError(f"Unable to determine input type: {type(source)}") - elif input_type == "url": + if input_type == "url": return self.convert_url(source, **kwargs) elif input_type == "local_file": return self.convert_local(source, **kwargs) From b86174d8ff269b1e68e67792f660a2ea9a43488b Mon Sep 17 00:00:00 2001 From: Nishith Jain <167524748+KingNish24@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:56:20 +0530 Subject: [PATCH 3/4] Update _markitdown.py --- packages/markitdown/src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 93f0b3d..538fc1b 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -6,7 +6,7 @@ import tempfile import warnings import traceback from importlib.metadata import entry_points -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Union, Literal from pathlib import Path from urllib.parse import urlparse from warnings import warn From d92cd2b2a7106eea8d64be647b1707c3b4e4137a Mon Sep 17 00:00:00 2001 From: Nishith Jain <167524748+KingNish24@users.noreply.github.com> Date: Tue, 11 Feb 2025 19:04:45 
+0530 Subject: [PATCH 4/4] Updated pip install from source to single line --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 8ac2fe3..2ac5fd1 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,7 @@ It supports: To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: ```bash -git clone git@github.com:microsoft/markitdown.git -cd markitdown -pip install -e packages/markitdown +pip install git+https://github.com/microsoft/markitdown.git#subdirectory=packages/markitdown ``` ## Usage