Significant cleanup and refactor.

This commit is contained in:
Adam Fourney 2025-02-09 20:42:58 -08:00
parent a795a16ce0
commit b40291d747
17 changed files with 24 additions and 113 deletions

View file

@ -1,49 +1,19 @@
# type: ignore
import base64
import binascii
import copy import copy
import html
import json
import mimetypes import mimetypes
import os import os
import re import re
import shutil
import subprocess
import sys
import tempfile import tempfile
import traceback from typing import Any, List, Optional, Union
import zipfile
import importlib
import sys
from importlib.metadata import entry_points
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import urlparse
from warnings import warn, resetwarnings, catch_warnings from warnings import warn
import mammoth
import markdownify
import olefile
import pandas as pd
import pdfminer
import pdfminer.high_level
import pptx
# File-format detection # File-format detection
import puremagic import puremagic
import requests import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Azure imports # Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
from .converters import ( from .converters import (
DocumentConverter, DocumentConverter,
@ -67,11 +37,8 @@ from .converters import (
ZipConverter, ZipConverter,
DocumentIntelligenceConverter, DocumentIntelligenceConverter,
) )
from .converters._markdownify import _CustomMarkdownify
from ._exceptions import ( from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException, FileConversionException,
UnsupportedFormatException, UnsupportedFormatException,
) )
@ -151,7 +118,6 @@ class MarkItDown:
self.register_page_converter(HtmlConverter()) self.register_page_converter(HtmlConverter())
self.register_page_converter(RssConverter()) self.register_page_converter(RssConverter())
self.register_page_converter(WikipediaConverter()) self.register_page_converter(WikipediaConverter())
self.register_page_converter(YouTubeConverter()) self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter()) self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter()) self.register_page_converter(DocxConverter())
@ -165,33 +131,17 @@ class MarkItDown:
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(OutlookMsgConverter()) self.register_page_converter(OutlookMsgConverter())
# print("Discovering plugins")
# for entry_point in entry_points(group="markitdown.converters"):
# args = {
# "required1": "Override1",
# "required2": "Override2",
# "required3": "Override3"
# }
#
# #print(entry_point)
# plugin = entry_point.load()
# instance = plugin(**args)
# print(instance)
# try:
# ConverterClass = entry_point.load()
# self.register_page_converter(ConverterClass())
# print(f"✔ Registered converter: {entry_point.name}")
# except Exception as e:
# print(f" Failed to load {entry_point.name}: {e}")
# print("Done")
# Register Document Intelligence converter at the top of the stack if endpoint is provided # Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None: if docintel_endpoint is not None:
self.register_page_converter( self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint) DocumentIntelligenceConverter(endpoint=docintel_endpoint)
) )
# print("Discovering plugins")
# for entry_point in entry_points(group="markitdown.converters"):
# #print(entry_point)
# plugin = entry_point.load()
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs

View file

@ -2,8 +2,8 @@
import base64 import base64
import re import re
from typing import Any, Union from typing import Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult

View file

@ -11,12 +11,6 @@ from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved. # This constant is a temporary fix until the bug is resolved.

View file

@ -1,9 +1,8 @@
from typing import Any, Dict, List, Optional, Union from typing import Union
import mammoth import mammoth
from ._base import ( from ._base import (
DocumentConverter,
DocumentConverterResult, DocumentConverterResult,
) )

View file

@ -1,7 +1,4 @@
import re
from typing import Any, Union from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult

View file

@ -1,5 +1,5 @@
from typing import Any, Dict, List, Optional, Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter

View file

@ -1,5 +1,5 @@
import json import json
from typing import Any, Dict, List, Optional, Union from typing import Any, Union
from ._base import ( from ._base import (
DocumentConverter, DocumentConverter,

View file

@ -1,8 +1,8 @@
import re import re
import markdownify import markdownify
from typing import Any, Union from typing import Any
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import quote, unquote, urlparse, urlunparse
class _CustomMarkdownify(markdownify.MarkdownConverter): class _CustomMarkdownify(markdownify.MarkdownConverter):

View file

@ -1,10 +1,9 @@
# type: ignore
import subprocess import subprocess
import shutil import shutil
import json import json
from warnings import warn from warnings import warn
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):

View file

@ -1,8 +1,8 @@
import tempfile import tempfile
from typing import Any, Dict, List, Optional, Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverterResult
from ._wav_converter import WavConverter from ._wav_converter import WavConverter
from warnings import warn, resetwarnings, catch_warnings from warnings import resetwarnings, catch_warnings
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False

View file

@ -1,4 +1,3 @@
# type: ignore
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
from typing import Union from typing import Union

View file

@ -1,4 +1,3 @@
# type: ignore
import base64 import base64
import pptx import pptx
import re import re
@ -6,7 +5,7 @@ import html
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter

View file

@ -1,6 +1,5 @@
# type: ignore
from xml.dom import minidom from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify

View file

@ -1,25 +1,15 @@
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
from warnings import warn, resetwarnings, catch_warnings
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try: try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
class WavConverter(MediaConverter): class WavConverter(MediaConverter):

View file

@ -1,7 +1,6 @@
import re import re
from typing import Any, Union from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult

View file

@ -1,18 +1,11 @@
import re import re
from typing import Any, Union, Dict, List from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:

View file

@ -5,13 +5,6 @@ from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
class ZipConverter(DocumentConverter): class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files. """Converts ZIP files to markdown by extracting and converting all contained files.