From b40291d7472a148eaadc23bf9b7c9d6b6bdd2809 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Sun, 9 Feb 2025 20:42:58 -0800 Subject: [PATCH] Significant cleanup and refactor. --- src/markitdown/_markitdown.py | 66 +++---------------- .../converters/_bing_serp_converter.py | 4 +- .../converters/_doc_intel_converter.py | 6 -- src/markitdown/converters/_docx_converter.py | 3 +- src/markitdown/converters/_html_converter.py | 3 - src/markitdown/converters/_image_converter.py | 4 +- src/markitdown/converters/_ipynb_converter.py | 2 +- src/markitdown/converters/_markdownify.py | 4 +- src/markitdown/converters/_media_converter.py | 3 +- src/markitdown/converters/_mp3_converter.py | 6 +- src/markitdown/converters/_pdf_converter.py | 1 - src/markitdown/converters/_pptx_converter.py | 3 +- src/markitdown/converters/_rss_converter.py | 3 +- src/markitdown/converters/_wav_converter.py | 12 +--- .../converters/_wikipedia_converter.py | 1 - .../converters/_youtube_converter.py | 9 +-- src/markitdown/converters/_zip_converter.py | 7 -- 17 files changed, 24 insertions(+), 113 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 08b431f..13e8aaf 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1,49 +1,19 @@ -# type: ignore -import base64 -import binascii import copy -import html -import json import mimetypes import os import re -import shutil -import subprocess -import sys import tempfile -import traceback -import zipfile -import importlib -import sys -from importlib.metadata import entry_points -from xml.dom import minidom -from typing import Any, Dict, List, Optional, Union +from typing import Any, List, Optional, Union from pathlib import Path -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse -from warnings import warn, resetwarnings, catch_warnings +from urllib.parse import urlparse +from warnings import warn -import mammoth -import markdownify -import olefile -import pandas as pd -import pdfminer -import pdfminer.high_level -import pptx # File-format detection import puremagic import requests -from bs4 import BeautifulSoup -from charset_normalizer import from_path # Azure imports -from azure.ai.documentintelligence import DocumentIntelligenceClient -from azure.ai.documentintelligence.models import ( - AnalyzeDocumentRequest, - AnalyzeResult, - DocumentAnalysisFeature, -) -from azure.identity import DefaultAzureCredential from .converters import ( DocumentConverter, @@ -67,11 +37,8 @@ from .converters import ( ZipConverter, DocumentIntelligenceConverter, ) -from .converters._markdownify import _CustomMarkdownify from ._exceptions import ( - MarkItDownException, - ConverterPrerequisiteException, FileConversionException, UnsupportedFormatException, ) @@ -151,7 +118,6 @@ class MarkItDown: self.register_page_converter(HtmlConverter()) self.register_page_converter(RssConverter()) self.register_page_converter(WikipediaConverter()) - self.register_page_converter(YouTubeConverter()) self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) @@ -165,33 +131,17 @@ class MarkItDown: self.register_page_converter(PdfConverter()) self.register_page_converter(OutlookMsgConverter()) - # print("Discovering plugins") - # for entry_point in entry_points(group="markitdown.converters"): - # args = { - # "required1": "Override1", - # "required2": "Override2", - # "required3": "Override3" - # } - # - # #print(entry_point) - # plugin = entry_point.load() - # instance = plugin(**args) - # print(instance) - - # try: - # ConverterClass = entry_point.load() - # self.register_page_converter(ConverterClass()) - # print(f"✔ Registered converter: {entry_point.name}") - # except Exception as e: - # print(f" Failed to load {entry_point.name}: {e}") - # print("Done") - # Register Document Intelligence converter at the top of the stack if endpoint is provided if docintel_endpoint is not None: self.register_page_converter( DocumentIntelligenceConverter(endpoint=docintel_endpoint) ) + # print("Discovering plugins") + # for entry_point in entry_points(group="markitdown.converters"): + # #print(entry_point) + # plugin = entry_point.load() + def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs diff --git a/src/markitdown/converters/_bing_serp_converter.py b/src/markitdown/converters/_bing_serp_converter.py index 732f38a..b903724 100644 --- a/src/markitdown/converters/_bing_serp_converter.py +++ b/src/markitdown/converters/_bing_serp_converter.py @@ -2,8 +2,8 @@ import base64 import re -from typing import Any, Union -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from typing import Union +from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult diff --git a/src/markitdown/converters/_doc_intel_converter.py b/src/markitdown/converters/_doc_intel_converter.py index 497dbdc..94acc9f 100644 --- a/src/markitdown/converters/_doc_intel_converter.py +++ b/src/markitdown/converters/_doc_intel_converter.py @@ -11,12 +11,6 @@ from azure.identity import DefaultAzureCredential from ._base import DocumentConverter, DocumentConverterResult -from .._exceptions import ( - MarkItDownException, - ConverterPrerequisiteException, - FileConversionException, - UnsupportedFormatException, -) # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # This constant is a temporary fix until the bug is resolved. diff --git a/src/markitdown/converters/_docx_converter.py b/src/markitdown/converters/_docx_converter.py index 766d1cb..fb61cca 100644 --- a/src/markitdown/converters/_docx_converter.py +++ b/src/markitdown/converters/_docx_converter.py @@ -1,9 +1,8 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Union import mammoth from ._base import ( - DocumentConverter, DocumentConverterResult, ) diff --git a/src/markitdown/converters/_html_converter.py b/src/markitdown/converters/_html_converter.py index faf3cd2..ae7259e 100644 --- a/src/markitdown/converters/_html_converter.py +++ b/src/markitdown/converters/_html_converter.py @@ -1,7 +1,4 @@ -import re - from typing import Any, Union -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult diff --git a/src/markitdown/converters/_image_converter.py b/src/markitdown/converters/_image_converter.py index ca3a91d..f3dee6b 100644 --- a/src/markitdown/converters/_image_converter.py +++ b/src/markitdown/converters/_image_converter.py @@ -1,5 +1,5 @@ -from typing import Any, Dict, List, Optional, Union -from ._base import DocumentConverter, DocumentConverterResult +from typing import Union +from ._base import DocumentConverterResult from ._media_converter import MediaConverter diff --git a/src/markitdown/converters/_ipynb_converter.py b/src/markitdown/converters/_ipynb_converter.py index 75a215b..cdeb478 100644 --- a/src/markitdown/converters/_ipynb_converter.py +++ b/src/markitdown/converters/_ipynb_converter.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List, Optional, Union +from typing import Any, Union from ._base import ( DocumentConverter, diff --git a/src/markitdown/converters/_markdownify.py b/src/markitdown/converters/_markdownify.py index ae1f353..5b6d739 100644 --- a/src/markitdown/converters/_markdownify.py +++ b/src/markitdown/converters/_markdownify.py @@ -1,8 +1,8 @@ import re import markdownify -from typing import Any, Union -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from typing import Any +from urllib.parse import quote, unquote, urlparse, urlunparse class _CustomMarkdownify(markdownify.MarkdownConverter): diff --git a/src/markitdown/converters/_media_converter.py b/src/markitdown/converters/_media_converter.py index 55dc038..07d2bde 100644 --- a/src/markitdown/converters/_media_converter.py +++ b/src/markitdown/converters/_media_converter.py @@ -1,10 +1,9 @@ -# type: ignore import subprocess import shutil import json from warnings import warn -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter class MediaConverter(DocumentConverter): diff --git a/src/markitdown/converters/_mp3_converter.py b/src/markitdown/converters/_mp3_converter.py index 34e839f..6b2786b 100644 --- a/src/markitdown/converters/_mp3_converter.py +++ b/src/markitdown/converters/_mp3_converter.py @@ -1,8 +1,8 @@ import tempfile -from typing import Any, Dict, List, Optional, Union -from ._base import DocumentConverter, DocumentConverterResult +from typing import Union +from ._base import DocumentConverterResult from ._wav_converter import WavConverter -from warnings import warn, resetwarnings, catch_warnings +from warnings import resetwarnings, catch_warnings # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False diff --git a/src/markitdown/converters/_pdf_converter.py b/src/markitdown/converters/_pdf_converter.py index 8a399db..dcffc62 100644 --- a/src/markitdown/converters/_pdf_converter.py +++ b/src/markitdown/converters/_pdf_converter.py @@ -1,4 +1,3 @@ -# type: ignore import pdfminer import pdfminer.high_level from typing import Union diff --git a/src/markitdown/converters/_pptx_converter.py b/src/markitdown/converters/_pptx_converter.py index abb4ed9..a48880a 100644 --- a/src/markitdown/converters/_pptx_converter.py +++ b/src/markitdown/converters/_pptx_converter.py @@ -1,4 +1,3 @@ -# type: ignore import base64 import pptx import re @@ -6,7 +5,7 @@ import html from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverterResult, DocumentConverter from ._html_converter import HtmlConverter diff --git a/src/markitdown/converters/_rss_converter.py b/src/markitdown/converters/_rss_converter.py index bf0d7c8..eb2f09c 100644 --- a/src/markitdown/converters/_rss_converter.py +++ b/src/markitdown/converters/_rss_converter.py @@ -1,6 +1,5 @@ -# type: ignore from xml.dom import minidom -from typing import Any, Dict, List, Optional, Union +from typing import Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify diff --git a/src/markitdown/converters/_wav_converter.py b/src/markitdown/converters/_wav_converter.py index accf51e..6fc8932 100644 --- a/src/markitdown/converters/_wav_converter.py +++ b/src/markitdown/converters/_wav_converter.py @@ -1,25 +1,15 @@ from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverterResult from ._media_converter import MediaConverter -from warnings import warn, resetwarnings, catch_warnings # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: - # Using warnings' catch_warnings to catch - # pydub's warning of ffmpeg or avconv missing - with catch_warnings(record=True) as w: - import pydub - - if w: - raise ModuleNotFoundError import speech_recognition as sr IS_AUDIO_TRANSCRIPTION_CAPABLE = True except ModuleNotFoundError: pass -finally: - resetwarnings() class WavConverter(MediaConverter): diff --git a/src/markitdown/converters/_wikipedia_converter.py b/src/markitdown/converters/_wikipedia_converter.py index 729171c..4097ef0 100644 --- a/src/markitdown/converters/_wikipedia_converter.py +++ b/src/markitdown/converters/_wikipedia_converter.py @@ -1,7 +1,6 @@ import re from typing import Any, Union -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult diff --git a/src/markitdown/converters/_youtube_converter.py b/src/markitdown/converters/_youtube_converter.py index 88d4017..fe198e8 100644 --- a/src/markitdown/converters/_youtube_converter.py +++ b/src/markitdown/converters/_youtube_converter.py @@ -1,18 +1,11 @@ import re from typing import Any, Union, Dict, List -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup from ._base import DocumentConverter, DocumentConverterResult -from ._markdownify import _CustomMarkdownify -from .._exceptions import ( - MarkItDownException, - ConverterPrerequisiteException, - FileConversionException, - UnsupportedFormatException, -) # Optional YouTube transcription support try: diff --git a/src/markitdown/converters/_zip_converter.py b/src/markitdown/converters/_zip_converter.py index 24e42ad..918c357 100644 --- a/src/markitdown/converters/_zip_converter.py +++ b/src/markitdown/converters/_zip_converter.py @@ -5,13 +5,6 @@ from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult -from .._exceptions import ( - MarkItDownException, - ConverterPrerequisiteException, - FileConversionException, - UnsupportedFormatException, -) - class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files.