feat: Add comprehensive XML support with structured Markdown conversion
This commit is contained in:
parent
bfde857420
commit
b89f51acdc
2 changed files with 114 additions and 1 deletions
27
README.md
27
README.md
|
|
@ -15,6 +15,8 @@ It supports:
|
||||||
- Audio (EXIF metadata and speech transcription)
|
- Audio (EXIF metadata and speech transcription)
|
||||||
- HTML
|
- HTML
|
||||||
- Text-based formats (CSV, JSON, XML)
|
- Text-based formats (CSV, JSON, XML)
|
||||||
|
- XML support includes general XML files, RSS feeds, and Atom feeds
|
||||||
|
- Preserves XML structure and attributes in Markdown format
|
||||||
- ZIP files (iterates over contents)
|
- ZIP files (iterates over contents)
|
||||||
|
|
||||||
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
|
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
|
||||||
|
|
@ -51,6 +53,31 @@ result = md.convert("test.xlsx")
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### XML Support
|
||||||
|
|
||||||
|
The library provides comprehensive XML support:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
|
md = MarkItDown()
|
||||||
|
|
||||||
|
# General XML files
|
||||||
|
result = md.convert("data.xml")
|
||||||
|
|
||||||
|
# RSS feeds
|
||||||
|
result = md.convert("feed.rss")
|
||||||
|
|
||||||
|
# Atom feeds
|
||||||
|
result = md.convert("feed.atom")
|
||||||
|
```
|
||||||
|
|
||||||
|
XML files are converted to a structured Markdown format that preserves:
|
||||||
|
- XML element hierarchy using Markdown headers
|
||||||
|
- Element attributes as lists
|
||||||
|
- Text content
|
||||||
|
- Special handling for RSS and Atom feeds with proper formatting
|
||||||
|
|
||||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|
|
||||||
|
|
@ -1399,6 +1399,91 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class XMLConverter(DocumentConverter):
    """Convert general XML files to structured Markdown.

    The element hierarchy is rendered as Markdown headings, attributes as
    bullet lists, and text (including CDATA sections) as plain paragraphs.
    Documents that look like RSS or Atom feeds are declined (``None`` is
    returned) so that the specialized RSSConverter can handle them.

    Features:
    - Converts XML element hierarchy to Markdown headings
    - Preserves element attributes as lists
    - Maintains text content
    - Automatically detects RSS/Atom feeds and declines them
    - Reports invalid XML as an error message in the returned text

    Supported file extensions:
    - .xml: General XML files
    - .docbook: DocBook XML files
    - .qtl: QTL files
    - .rng: RELAX NG files
    """

    # File extensions (lower-case, with leading dot) this converter accepts.
    _SUPPORTED_EXTENSIONS = (".xml", ".docbook", ".qtl", ".rng")

    # Markdown only defines heading levels 1 through 6.
    _MAX_HEADING_LEVEL = 6

    def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert the XML file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to parse.
            **kwargs: ``file_extension`` is read to decide whether this
                converter applies; other keys are ignored.

        Returns:
            ``None`` when the extension is unsupported or the document is an
            RSS/Atom feed; otherwise a ``DocumentConverterResult`` whose text
            is the Markdown rendering, or an error message if parsing or
            conversion failed.
        """
        # Bail if not an XML type. ``or ""`` guards against an explicit
        # ``file_extension=None``, which would otherwise break ``.lower()``.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in self._SUPPORTED_EXTENSIONS:
            return None

        try:
            # NOTE(security): xml.dom.minidom is not hardened against
            # maliciously constructed XML (e.g. entity-expansion attacks).
            # Review before feeding untrusted input through this path.
            doc = minidom.parse(local_path)

            # RSS or Atom feed: decline so RSSConverter can handle it.
            is_rss = bool(doc.getElementsByTagName("rss"))
            is_atom = bool(
                doc.getElementsByTagName("feed")
                and doc.getElementsByTagName("entry")
            )
            if is_rss or is_atom:
                return None

            return DocumentConverterResult(
                title=None,
                text_content=self._convert_xml_to_markdown(doc),
            )
        except Exception as e:
            # Deliberate best-effort: a failure is reported as the converted
            # text instead of being raised.
            # NOTE(review): returning a result here means no other converter
            # gets a chance at the file, and the message below is Turkish
            # ("XML conversion error ... please check that the file is a
            # valid XML file") while the rest of the project is English.
            # Both are kept as-is to avoid changing runtime output; confirm
            # the intended behavior.
            error_msg = f"XML dönüştürme hatası: {str(e)}\n"
            error_msg += "Lütfen dosyanın geçerli bir XML dosyası olduğunu kontrol edin."
            return DocumentConverterResult(
                title=None,
                text_content=error_msg
            )

    def _convert_xml_to_markdown(self, doc: minidom.Document) -> str:
        """Render a parsed XML document as Markdown.

        The root element becomes the level-1 heading; its descendants are
        rendered by ``_process_node`` starting at level 2.
        """
        root = doc.documentElement
        # level=0 makes the root's direct children level-2 headings, so no
        # heading level is skipped between the root and its children.
        body = self._process_node(root, level=0)
        return f"# {root.tagName}\n\n{body}".strip()

    def _process_node(self, node: minidom.Element, level: int = 0) -> str:
        """Recursively render ``node``'s attributes, text and child elements.

        Args:
            node: Element whose contents are rendered. The element's own
                heading is emitted by the caller.
            level: Nesting depth of ``node``; child elements get a heading
                of ``level + 2`` hashes, capped at six.

        Returns:
            The Markdown fragment for everything inside ``node``.
        """
        parts = []

        # Attributes are listed first, as a bullet list.
        attributes = node.attributes
        if attributes and attributes.length > 0:
            parts.append("**Attributes:**\n\n")
            parts.extend(f"- {name}: {value}\n" for name, value in attributes.items())
            parts.append("\n")

        # CDATA sections carry character data just like text nodes do.
        text_node_types = (
            minidom.Node.TEXT_NODE,
            minidom.Node.CDATA_SECTION_NODE,
        )
        # More than six hashes is not a Markdown heading, so clamp the depth.
        child_heading = "#" * min(level + 2, self._MAX_HEADING_LEVEL)

        for child in node.childNodes:
            if child.nodeType in text_node_types:
                text = child.data.strip()
                if text:  # skip whitespace-only nodes (indentation/newlines)
                    parts.append(f"{text}\n")
            elif child.nodeType == minidom.Node.ELEMENT_NODE:
                parts.append(f"{child_heading} {child.tagName}\n\n")
                parts.append(self._process_node(child, level + 1))

        return "".join(parts)
|
||||||
|
|
||||||
class FileConversionException(BaseException):
    """Exception type named for file-conversion failures.

    It derives from ``BaseException`` rather than ``Exception``, so a plain
    ``except Exception`` handler does not catch it. The raise sites are
    outside this excerpt.
    """
|
||||||
|
|
||||||
|
|
@ -1472,7 +1557,8 @@ class MarkItDown:
|
||||||
# To this end, the most specific converters should appear below the most generic converters
|
# To this end, the most specific converters should appear below the most generic converters
|
||||||
self.register_page_converter(PlainTextConverter())
|
self.register_page_converter(PlainTextConverter())
|
||||||
self.register_page_converter(HtmlConverter())
|
self.register_page_converter(HtmlConverter())
|
||||||
self.register_page_converter(RSSConverter())
|
self.register_page_converter(XMLConverter()) # Generic XML converter
|
||||||
|
self.register_page_converter(RSSConverter()) # Specific XML type
|
||||||
self.register_page_converter(WikipediaConverter())
|
self.register_page_converter(WikipediaConverter())
|
||||||
self.register_page_converter(YouTubeConverter())
|
self.register_page_converter(YouTubeConverter())
|
||||||
self.register_page_converter(BingSerpConverter())
|
self.register_page_converter(BingSerpConverter())
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue