From 0f948ade4057e30636e42d5201a2b1b03a0bc63a Mon Sep 17 00:00:00 2001
From: ZeyuTeng96 <96521059+ZeyuTeng96@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:10:40 +0800
Subject: [PATCH] JsonConverter for Converting JSON Files into Structured Markdown Files

Converts JSON files to Markdown. The output preserves the structure of the
JSON file as closely as possible, while using Markdown syntax for readability.
---
 src/markitdown/_markitdown.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 789c1e5..31b659a 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1204,6 +1204,55 @@ class ZipConverter(DocumentConverter):
                 text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
             )
 
+
+class JsonConverter(DocumentConverter):
+    """
+    Converts JSON files to Markdown. The output preserves the structure of the
+    JSON file as closely as possible, while using Markdown syntax for
+    readability.
+
+    A top-level array is split into one "# <list_heading> <n>" section per
+    element (numbered from 1); any other top-level value is emitted as a
+    single block. Strings are emitted verbatim; all other values are
+    serialized with json.dumps, so true/false/null keep their JSON spelling.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a JSON file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".json":
+            return None
+
+        # Heading prefix for each element of a top-level array
+        list_heading = kwargs.get("list_heading", "Elem")
+
+        # JSON text is UTF-8 (RFC 8259), so do not rely on the platform default
+        # encoding. "utf-8-sig" also skips a leading BOM, which json.load
+        # would otherwise reject.
+        with open(local_path, "r", encoding="utf-8-sig") as f:
+            json_data = json.load(f)
+
+        if isinstance(json_data, list):
+            md_content = "\n\n".join(
+                f"# {list_heading} {idx}\n{self._render(item)}"
+                for idx, item in enumerate(json_data, start=1)
+            )
+        else:
+            md_content = self._render(json_data)
+
+        return DocumentConverterResult(
+            title=None,
+            # Remove leading/trailing whitespace
+            text_content=md_content.strip(),
+        )
+
+    @staticmethod
+    def _render(value) -> str:
+        """Render a parsed JSON value: strings verbatim, everything else as JSON."""
+        if isinstance(value, str):
+            return value
+        return json.dumps(value, indent=4, ensure_ascii=False)
+
 
 class FileConversionException(BaseException):
     pass
@@ -1276,6 +1325,7 @@ class MarkItDown:
         self.register_page_converter(WikipediaConverter())
         self.register_page_converter(YouTubeConverter())
         self.register_page_converter(BingSerpConverter())
+        self.register_page_converter(JsonConverter())
         self.register_page_converter(DocxConverter())
         self.register_page_converter(XlsxConverter())
         self.register_page_converter(PptxConverter())