diff --git a/README.md b/README.md
index df7189d..baaab5e 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ It presently supports:
 - Audio (EXIF metadata, and speech transcription)
 - HTML (special handling of Wikipedia, etc.)
 - Various other text-based formats (csv, json, xml, etc.)
+- OneNote (.one)
 
 # Installation
 
@@ -51,6 +52,18 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+To convert OneNote files, you can use the following example:
+
+```python
+from markitdown import MarkItDown
+
+markitdown = MarkItDown()
+result = markitdown.convert("example.one")
+print(result.text_content)
+```
+
+Note: For OneNote support, the `one-extract` package is used.
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
diff --git a/pyproject.toml b/pyproject.toml
index 74df032..6eac949 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
   "youtube-transcript-api",
   "SpeechRecognition",
   "pathvalidate",
+  "one-extract",
 ]
 
 [project.urls]
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96997cf..f489bd9 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -21,6 +21,7 @@ import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pptx
+import one_extract as onenote
 
 # File-format detection
 import puremagic
@@ -617,6 +618,34 @@ class PptxConverter(HtmlConverter):
         return False
 
 
+class OneNoteConverter(HtmlConverter):
+    """
+    Converts OneNote files to Markdown. Supports headings, tables and images with alt text.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a OneNote file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".one":
+            return None
+
+        md_content = ""
+
+        # NOTE(review): assumes one_extract exposes Notebook/sections/pages and
+        # that page.content is HTML -- confirm against the package's API.
+        notebook = onenote.Notebook(local_path)
+        for section in notebook.sections:
+            md_content += f"\n\n# {section.name}\n"
+            for page in section.pages:
+                md_content += f"\n\n## {page.name}\n"
+                md_content += self._convert(page.content).text_content.strip() + "\n\n"
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+
 class MediaConverter(DocumentConverter):
     """
     Abstract class for multi-modal media (e.g., images and audio)
@@ -880,6 +909,7 @@ class MarkItDown:
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
         self.register_page_converter(PdfConverter())
+        self.register_page_converter(OneNoteConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 94fd886..4026e72 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -87,6 +87,15 @@ SERP_TEST_EXCLUDES = [
     "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 
+ONENOTE_TEST_STRINGS = [
+    "# Section 1",
+    "## Page 1",
+    "This is a test OneNote page.",
+    "# Section 2",
+    "## Page 2",
+    "Another test OneNote page.",
+]
+
 
 @pytest.mark.skipif(
     skip_remote,
@@ -164,6 +173,12 @@ def test_markitdown_local() -> None:
     for test_string in SERP_TEST_STRINGS:
         assert test_string in text_content
 
+    # Test OneNote processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.one"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in ONENOTE_TEST_STRINGS:
+        assert test_string in text_content
+
 
 @pytest.mark.skipif(
     skip_exiftool,