feat: Add comprehensive XML support with structured Markdown conversion

2025-01-31 03:40:03 +03:00 · 2025-01-31 03:40:03 +03:00 · b89f51acdc
commit b89f51acdc
parent bfde857420
2 changed files with 114 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -15,6 +15,8 @@ It supports:
 - Audio (EXIF metadata and speech transcription)
 - HTML
 - Text-based formats (CSV, JSON, XML)
  - XML support includes general XML files, RSS feeds, and Atom feeds
  - Preserves XML structure and attributes in Markdown format
 - ZIP files (iterates over contents)
 To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
@ -51,6 +53,31 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 #### XML Support
 The library provides comprehensive XML support:
 ```python
 from markitdown import MarkItDown
 md = MarkItDown()
 # General XML files
 result = md.convert("data.xml")
 # RSS feeds
 result = md.convert("feed.rss")
 # Atom feeds
 result = md.convert("feed.atom")
 ```
 XML files are converted to a structured Markdown format that preserves:
 - XML element hierarchy using Markdown headers
 - Element attributes as lists
 - Text content
 - RSS and Atom feeds are detected automatically and handed to the dedicated feed converter for feed-specific formatting
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 ```python
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -1399,6 +1399,91 @@ class DocumentIntelligenceConverter(DocumentConverter):
        )
 class XMLConverter(DocumentConverter):
    """Convert general XML files to structured Markdown.

    The root element becomes a level-1 heading and every nested element a
    heading one level deeper than its parent. Attributes are rendered as a
    bullet list under an ``**Attributes:**`` label, and non-blank text
    (including CDATA sections) is emitted as plain lines.

    Documents that look like RSS or Atom feeds are declined (``convert``
    returns ``None``) so that a feed-specific converter can handle them
    (presumably ``RSSConverter``; it is not visible from this class).

    Supported file extensions:
    - .xml: General XML files
    - .docbook: DocBook XML files
    - .qtl: QTL files
    - .rng: RELAX NG files
    """

    # Lower-case extensions (with leading dot) accepted by ``convert``.
    _SUPPORTED_EXTENSIONS = (".xml", ".docbook", ".qtl", ".rng")

    def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert the XML file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to parse.
            **kwargs: Only ``file_extension`` is read; a missing or ``None``
                value is treated as "no extension".

        Returns:
            ``None`` when the extension is unsupported or the document looks
            like an RSS/Atom feed; otherwise a ``DocumentConverterResult``
            whose ``text_content`` is the Markdown, or an error message if
            parsing or conversion raised.
        """
        # Bail if not an XML type. ``or ""`` guards against an explicit
        # ``file_extension=None``, which ``kwargs.get(..., "")`` would pass
        # through and crash on ``.lower()``.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in self._SUPPORTED_EXTENSIONS:
            return None

        try:
            doc = minidom.parse(local_path)

            # RSS or Atom feed? Decline so a feed-specific converter can take it.
            if doc.getElementsByTagName("rss") or (
                doc.getElementsByTagName("feed") and doc.getElementsByTagName("entry")
            ):
                return None

            md_content = self._convert_xml_to_markdown(doc)
            return DocumentConverterResult(title=None, text_content=md_content)
        except Exception as e:
            # NOTE(review): the message below is Turkish ("XML conversion
            # error: ... Please check that the file is a valid XML file.") and
            # is returned as if it were converted content. Both are kept
            # unchanged here because callers may rely on them; consider raising
            # or returning None so failures are not reported as successes.
            error_msg = f"XML dönüştürme hatası: {str(e)}\n"
            error_msg += "Lütfen dosyanın geçerli bir XML dosyası olduğunu kontrol edin."
            return DocumentConverterResult(title=None, text_content=error_msg)

    def _convert_xml_to_markdown(self, doc: minidom.Document) -> str:
        """Render a parsed document as Markdown.

        The root element's tag name is the level-1 heading; everything below
        it comes from ``_process_node``. Surrounding whitespace is stripped
        from the result.
        """
        root = doc.documentElement
        # The root sits at depth 0, so its children get "##". Passing level=1
        # here made them "###" and skipped heading level 2 entirely.
        body = self._process_node(root, level=0)
        return f"# {root.tagName}\n\n{body}".strip()

    def _process_node(self, node: minidom.Element, level: int = 0) -> str:
        """Render the attributes and children of ``node`` recursively.

        Args:
            node: Element whose attributes and child nodes are rendered. The
                heading for ``node`` itself is written by the caller.
            level: Depth of ``node`` below the root (root is 0). Child
                elements get a heading of ``level + 2`` ``#`` characters.

        Returns:
            The Markdown fragment for ``node``'s attributes, text and
            descendant elements, in document order.
        """
        parts = []

        # Attributes first, as a bullet list.
        attributes = node.attributes
        if attributes and attributes.length > 0:
            parts.append("**Attributes:**\n\n")
            parts.extend(f"- {name}: {value}\n" for name, value in attributes.items())
            parts.append("\n")

        # CDATA sections have their own node type; treating them like text
        # keeps their content from being silently dropped.
        text_node_types = (
            minidom.Node.TEXT_NODE,
            minidom.Node.CDATA_SECTION_NODE,
        )

        for child in node.childNodes:
            if child.nodeType in text_node_types:
                text = child.data.strip()
                if text:  # skip whitespace-only nodes (indentation, newlines)
                    parts.append(f"{text}\n")
            elif child.nodeType == minidom.Node.ELEMENT_NODE:
                parts.append(f"{'#' * (level + 2)} {child.tagName}\n\n")
                parts.append(self._process_node(child, level + 1))
            # Other node types (comments, processing instructions) are ignored.

        return "".join(parts)
 class FileConversionException(BaseException):
    """Exception type for file-conversion failures.

    Presumably raised when a file cannot be converted; the raise sites are
    not visible from this definition.

    NOTE(review): this derives from ``BaseException`` rather than
    ``Exception``, so ``except Exception`` handlers will not catch it.
    Confirm that this is intentional.
    """
@ -1472,7 +1557,8 @@ class MarkItDown:
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
-        self.register_page_converter(RSSConverter())
+        self.register_page_converter(XMLConverter())  # Generic XML converter
        self.register_page_converter(RSSConverter())  # Specific XML type
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(BingSerpConverter())