Significant cleanup and refactor.

This commit is contained in:
Adam Fourney 2025-02-09 20:42:58 -08:00
parent a795a16ce0
commit b40291d747
17 changed files with 24 additions and 113 deletions

View file

@ -1,49 +1,19 @@
# type: ignore
import base64
import binascii
import copy
import html
import json
import mimetypes
import os
import re
import shutil
import subprocess
import sys
import tempfile
import traceback
import zipfile
import importlib
import sys
from importlib.metadata import entry_points
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from typing import Any, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
from urllib.parse import urlparse
from warnings import warn
import mammoth
import markdownify
import olefile
import pandas as pd
import pdfminer
import pdfminer.high_level
import pptx
# File-format detection
import puremagic
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
from .converters import (
DocumentConverter,
@ -67,11 +37,8 @@ from .converters import (
ZipConverter,
DocumentIntelligenceConverter,
)
from .converters._markdownify import _CustomMarkdownify
from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
@ -151,7 +118,6 @@ class MarkItDown:
self.register_page_converter(HtmlConverter())
self.register_page_converter(RssConverter())
self.register_page_converter(WikipediaConverter())
self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter())
@ -165,33 +131,17 @@ class MarkItDown:
self.register_page_converter(PdfConverter())
self.register_page_converter(OutlookMsgConverter())
# print("Discovering plugins")
# for entry_point in entry_points(group="markitdown.converters"):
# args = {
# "required1": "Override1",
# "required2": "Override2",
# "required3": "Override3"
# }
#
# #print(entry_point)
# plugin = entry_point.load()
# instance = plugin(**args)
# print(instance)
# try:
# ConverterClass = entry_point.load()
# self.register_page_converter(ConverterClass())
# print(f"✔ Registered converter: {entry_point.name}")
# except Exception as e:
# print(f" Failed to load {entry_point.name}: {e}")
# print("Done")
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
# print("Discovering plugins")
# for entry_point in entry_points(group="markitdown.converters"):
# #print(entry_point)
# plugin = entry_point.load()
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs

View file

@ -2,8 +2,8 @@
import base64
import re
from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from typing import Union
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult

View file

@ -11,12 +11,6 @@ from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
# TODO: The Document Intelligence SDK currently has a bug that prevents importing the "ContentFormat" enum.
# This constant is a temporary workaround until the bug is resolved.

View file

@ -1,9 +1,8 @@
from typing import Any, Dict, List, Optional, Union
from typing import Union
import mammoth
from ._base import (
DocumentConverter,
DocumentConverterResult,
)

View file

@ -1,7 +1,4 @@
import re
from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult

View file

@ -1,5 +1,5 @@
from typing import Any, Dict, List, Optional, Union
from ._base import DocumentConverter, DocumentConverterResult
from typing import Union
from ._base import DocumentConverterResult
from ._media_converter import MediaConverter

View file

@ -1,5 +1,5 @@
import json
from typing import Any, Dict, List, Optional, Union
from typing import Any, Union
from ._base import (
DocumentConverter,

View file

@ -1,8 +1,8 @@
import re
import markdownify
from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from typing import Any
from urllib.parse import quote, unquote, urlparse, urlunparse
class _CustomMarkdownify(markdownify.MarkdownConverter):

View file

@ -1,10 +1,9 @@
# type: ignore
import subprocess
import shutil
import json
from warnings import warn
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
class MediaConverter(DocumentConverter):

View file

@ -1,8 +1,8 @@
import tempfile
from typing import Any, Dict, List, Optional, Union
from ._base import DocumentConverter, DocumentConverterResult
from typing import Union
from ._base import DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import warn, resetwarnings, catch_warnings
from warnings import resetwarnings, catch_warnings
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False

View file

@ -1,4 +1,3 @@
# type: ignore
import pdfminer
import pdfminer.high_level
from typing import Union

View file

@ -1,4 +1,3 @@
# type: ignore
import base64
import pptx
import re
@ -6,7 +5,7 @@ import html
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter

View file

@ -1,6 +1,5 @@
# type: ignore
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from typing import Union
from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify

View file

@ -1,25 +1,15 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverterResult
from ._media_converter import MediaConverter
from warnings import warn, resetwarnings, catch_warnings
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
# Use warnings.catch_warnings to capture
# pydub's warning that ffmpeg or avconv is missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
pass
finally:
resetwarnings()
class WavConverter(MediaConverter):

View file

@ -1,7 +1,6 @@
import re
from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult

View file

@ -1,18 +1,11 @@
import re
from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
# Optional YouTube transcription support
try:

View file

@ -5,13 +5,6 @@ from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.