diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index a224d1b..a8ead66 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -55,6 +55,12 @@ from .converters import ( YouTubeConverter, IpynbConverter, BingSerpConverter, + PdfConverter, + DocxConverter, + XlsxConverter, + XlsConverter, + PptxConverter, + ImageConverter, ) from .converters._markdownify import _CustomMarkdownify @@ -94,264 +100,6 @@ finally: resetwarnings() -class PdfConverter(DocumentConverter): - """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PDF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pdf": - return None - - return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), - ) - - -class DocxConverter(HtmlConverter): - """ - Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a DOCX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".docx": - return None - - result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) - - return result - - -class XlsxConverter(HtmlConverter): - """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": - return None - - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class XlsConverter(HtmlConverter): - """ - Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLS - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xls": - return None - - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class PptxConverter(HtmlConverter): - """ - Converts PPTX files to Markdown. Supports heading, tables and images with alt text. - """ - - def _get_llm_description( - self, llm_client, llm_model, image_blob, content_type, prompt=None - ): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed alt text for this image with less than 50 words." 
- - image_base64 = base64.b64encode(image_blob).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" - - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - {"type": "text", "text": prompt}, - ], - } - ] - - response = llm_client.chat.completions.create( - model=llm_model, messages=messages - ) - return response.choices[0].message.content - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PPTX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pptx": - return None - - md_content = "" - - presentation = pptx.Presentation(local_path) - slide_num = 0 - for slide in presentation.slides: - slide_num += 1 - - md_content += f"\n\n\n" - - title = slide.shapes.title - for shape in slide.shapes: - # Pictures - if self._is_picture(shape): - # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - - llm_description = None - alt_text = None - - llm_client = kwargs.get("llm_client") - llm_model = kwargs.get("llm_model") - if llm_client is not None and llm_model is not None: - try: - llm_description = self._get_llm_description( - llm_client, - llm_model, - shape.image.blob, - shape.image.content_type, - ) - except Exception: - # Unable to describe with LLM - pass - - if not llm_description: - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get( - "descr", "" - ) - except Exception: - # Unable to get alt text - pass - - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += ( - "\n![" - + (llm_description or alt_text or shape.name) - + "](" - + filename - + ")\n" - ) - - # Tables - if self._is_table(shape): - html_table = "" - first_row = True - for row in shape.table.rows: - html_table += "" - for cell in row.cells: - if first_row: - html_table += "" - else: - html_table += "" - html_table += "" - first_row = False - html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" - md_content += ( - "\n" + self._convert(html_table).text_content.strip() + "\n" - ) - - # Charts - if shape.has_chart: - md_content += self._convert_chart_to_markdown(shape.chart) - - # Text areas - elif shape.has_text_frame: - if shape == title: - md_content += "# " + shape.text.lstrip() + "\n" - else: - md_content += shape.text + "\n" - - md_content = md_content.strip() - - if slide.has_notes_slide: - md_content += "\n\n### Notes:\n" - notes_frame = slide.notes_slide.notes_text_frame - if notes_frame is not None: - md_content += notes_frame.text - md_content = md_content.strip() - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _is_picture(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: - return True - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: - if hasattr(shape, "image"): - return True - return False - - def _is_table(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: - return True - return False - - def _convert_chart_to_markdown(self, chart): - md = "\n\n### Chart" - if chart.has_title: - md += f": {chart.chart_title.text_frame.text}" - md += "\n\n" - data = [] - category_names = [c.label for c in chart.plots[0].categories] - series_names = [s.name for s in chart.series] - data.append(["Category"] + series_names) - - for idx, category in enumerate(category_names): - row = [category] - for series in chart.series: - row.append(series.values[idx]) - data.append(row) - - markdown_table = [] - for row in data: - markdown_table.append("| " + " | ".join(map(str, row)) + " |") - header = markdown_table[0] - separator = "|" + "|".join(["---"] * len(data[0])) + "|" - return md + "\n".join([header, separator] + markdown_table[1:]) - - class MediaConverter(DocumentConverter): """ Abstract class for multi-modal media (e.g., images and audio) @@ -498,89 +246,6 @@ class Mp3Converter(WavConverter): ) -class ImageConverter(MediaConverter): - 
""" - Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not an image - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".jpg", ".jpeg", ".png"]: - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "ImageSize", - "Title", - "Caption", - "Description", - "Keywords", - "Artist", - "Author", - "DateTimeOriginal", - "CreateDate", - "GPSPosition", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Try describing the image with GPTV - llm_client = kwargs.get("llm_client") - llm_model = kwargs.get("llm_model") - if llm_client is not None and llm_model is not None: - md_content += ( - "\n# Description:\n" - + self._get_llm_description( - local_path, - extension, - llm_client, - llm_model, - prompt=kwargs.get("llm_prompt"), - ).strip() - + "\n" - ) - - return DocumentConverterResult( - title=None, - text_content=md_content, - ) - - def _get_llm_description(self, local_path, extension, client, model, prompt=None): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed caption for this image." 
- - data_uri = "" - with open(local_path, "rb") as image_file: - content_type, encoding = mimetypes.guess_type("_dummy" + extension) - if content_type is None: - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_file.read()).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" - - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - ], - } - ] - - response = client.chat.completions.create(model=model, messages=messages) - return response.choices[0].message.content - - class OutlookMsgConverter(DocumentConverter): """Converts Outlook .msg files to markdown by extracting email metadata and content. @@ -915,6 +580,7 @@ class MarkItDown: # Later registrations are tried first / take higher priority than earlier registrations # To this end, the most specific converters should appear below the most generic converters self.register_page_converter(PlainTextConverter()) + self.register_page_converter(ZipConverter()) self.register_page_converter(HtmlConverter()) self.register_page_converter(RssConverter()) self.register_page_converter(WikipediaConverter()) @@ -930,7 +596,6 @@ class MarkItDown: self.register_page_converter(ImageConverter()) self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) - self.register_page_converter(ZipConverter()) self.register_page_converter(OutlookMsgConverter()) # print("Discovering plugins") diff --git a/src/markitdown/converters/__init__.py b/src/markitdown/converters/__init__.py index f83f224..b3a5cf0 100644 --- a/src/markitdown/converters/__init__.py +++ b/src/markitdown/converters/__init__.py @@ -10,6 +10,11 @@ from ._wikipedia_converter import WikipediaConverter from ._youtube_converter import YouTubeConverter from ._ipynb_converter import IpynbConverter from ._bing_serp_converter import BingSerpConverter +from ._pdf_converter import PdfConverter +from 
._docx_converter import DocxConverter +from ._xlsx_converter import XlsxConverter, XlsConverter +from ._pptx_converter import PptxConverter +from ._image_converter import ImageConverter __all__ = [ "DocumentConverter", @@ -21,4 +26,10 @@ __all__ = [ "YouTubeConverter", "IpynbConverter", "BingSerpConverter", + "PdfConverter", + "DocxConverter", + "XlsxConverter", + "XlsConverter", + "PptxConverter", + "ImageConverter", ] diff --git a/src/markitdown/converters/_docx_converter.py b/src/markitdown/converters/_docx_converter.py new file mode 100644 index 0000000..766d1cb --- /dev/null +++ b/src/markitdown/converters/_docx_converter.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List, Optional, Union + +import mammoth + +from ._base import ( + DocumentConverter, + DocumentConverterResult, +) + +from ._html_converter import HtmlConverter + + +class DocxConverter(HtmlConverter): + """ + Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a DOCX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".docx":
+            return None
+
+        result = None
+        with open(local_path, "rb") as docx_file:
+            style_map = kwargs.get("style_map", None)
+
+            result = mammoth.convert_to_html(docx_file, style_map=style_map)
+            html_content = result.value
+            result = self._convert(html_content)
+
+        return result
diff --git a/src/markitdown/converters/_image_converter.py b/src/markitdown/converters/_image_converter.py
new file mode 100644
index 0000000..ca3a91d
--- /dev/null
+++ b/src/markitdown/converters/_image_converter.py
@@ -0,0 +1,91 @@
+import base64
+import mimetypes
+from typing import Any, Dict, List, Optional, Union
+from ._base import DocumentConverter, DocumentConverterResult
+from ._media_converter import MediaConverter
+
+
+class ImageConverter(MediaConverter):
+    """
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Emit selected metadata fields, then an LLM description when both llm_client and llm_model are given; None for non-image extensions."""
+        # Bail if not an image
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+
+        md_content = ""
+
+        # Add metadata
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+
+        if metadata:
+            for f in [
+                "ImageSize",
+                "Title",
+                "Caption",
+                "Description",
+                "Keywords",
+                "Artist",
+                "Author",
+                "DateTimeOriginal",
+                "CreateDate",
+                "GPSPosition",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+
+        # Try describing the image with GPTV
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
+            md_content += (
+                "\n# Description:\n"
+                + self._get_llm_description(
+                    local_path,
+                    extension,
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
+                ).strip()
+                + "\n"
+            )
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content,
+        )
+
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+        """Send the image as a base64 data URI together with `prompt` to `client`; return the first choice's message content."""
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
+
+        data_uri = ""
+        with open(local_path, "rb") as image_file:
+            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+            if content_type is None:
+                content_type = "image/jpeg"
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+            data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                ],
+            }
+        ]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        return response.choices[0].message.content
diff --git a/src/markitdown/converters/_media_converter.py b/src/markitdown/converters/_media_converter.py
new file mode 100644
index 0000000..55dc038
--- /dev/null
+++ b/src/markitdown/converters/_media_converter.py
@@ -0,0 +1,37 @@
+# type: ignore
+import subprocess
+import shutil
+import json
+from warnings import warn
+
+from ._base import DocumentConverter, DocumentConverterResult
+
+
+class MediaConverter(DocumentConverter):
+    """
+    Abstract class for multi-modal media (e.g., images and audio)
+    """
+
+    def _get_metadata(self, local_path, exiftool_path=None):
+        if not exiftool_path:
+            which_exiftool = shutil.which("exiftool")
+            if which_exiftool:
+                warn(
+                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                    DeprecationWarning,
+                )
+
+            return None
+        else:
+            try:
+                result = subprocess.run(
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
+                ).stdout
+                return json.loads(result)[0]
+            except Exception:
+                return None
diff --git a/src/markitdown/converters/_pdf_converter.py b/src/markitdown/converters/_pdf_converter.py
new file mode 100644
index 0000000..8a399db
--- /dev/null
+++ b/src/markitdown/converters/_pdf_converter.py
@@ -0,0 +1,22 @@
+# type: ignore
+import pdfminer
+import pdfminer.high_level
+from typing import Union
+from ._base import DocumentConverter, DocumentConverterResult
+
+
+class PdfConverter(DocumentConverter):
+    """
+    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a PDF
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pdf":
+            return None
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=pdfminer.high_level.extract_text(local_path),
+        )
diff --git a/src/markitdown/converters/_pptx_converter.py b/src/markitdown/converters/_pptx_converter.py
new file mode 100644
index 0000000..abb4ed9
--- /dev/null
+++ b/src/markitdown/converters/_pptx_converter.py
@@ -0,0 +1,186 @@
+# type: ignore
+import base64
+import pptx
+import re
+import html
+
+from typing import Union
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._html_converter import HtmlConverter
+
+
+class PptxConverter(HtmlConverter):
+    """
+    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
+    """
+
+    def _get_llm_description(
+        self, llm_client, llm_model, image_blob, content_type, prompt=None
+    ):
+        """Ask `llm_client` to describe `image_blob` (sent as a base64 data URI); return the first choice's message content."""
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed alt text for this image with less than 50 words."
+
+        image_base64 = base64.b64encode(image_blob).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        response = llm_client.chat.completions.create(
+            model=llm_model, messages=messages
+        )
+        return response.choices[0].message.content
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Convert a .pptx file slide by slide (pictures, tables, charts, text, notes); return None for other extensions."""
+        # Bail if not a PPTX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pptx":
+            return None
+
+        md_content = ""
+
+        presentation = pptx.Presentation(local_path)
+        slide_num = 0
+        for slide in presentation.slides:
+            slide_num += 1
+
+            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+            title = slide.shapes.title
+            for shape in slide.shapes:
+                # Pictures
+                if self._is_picture(shape):
+                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+
+                    llm_description = None
+                    alt_text = None
+
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        try:
+                            llm_description = self._get_llm_description(
+                                llm_client,
+                                llm_model,
+                                shape.image.blob,
+                                shape.image.content_type,
+                            )
+                        except Exception:
+                            # Unable to describe with LLM
+                            pass
+
+                    if not llm_description:
+                        try:
+                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                                "descr", ""
+                            )
+                        except Exception:
+                            # Unable to get alt text
+                            pass
+
+                    # A placeholder name
+                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    md_content += (
+                        "\n!["
+                        + (llm_description or alt_text or shape.name)
+                        + "]("
+                        + filename
+                        + ")\n"
+                    )
+
+                # Tables
+                if self._is_table(shape):
+                    html_table = "<html><body><table>"
+                    first_row = True
+                    for row in shape.table.rows:
+                        html_table += "<tr>"
+                        for cell in row.cells:
+                            if first_row:
+                                html_table += "<th>" + html.escape(cell.text) + "</th>"
+                            else:
+                                html_table += "<td>" + html.escape(cell.text) + "</td>"
+                        html_table += "</tr>"
+                        first_row = False
+                    html_table += "</table></body></html>"
+                    md_content += (
+                        "\n" + self._convert(html_table).text_content.strip() + "\n"
+                    )
+
+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
+                # Text areas
+                elif shape.has_text_frame:
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + "\n"
+                    else:
+                        md_content += shape.text + "\n"
+
+            md_content = md_content.strip()
+
+            if slide.has_notes_slide:
+                md_content += "\n\n### Notes:\n"
+                notes_frame = slide.notes_slide.notes_text_frame
+                if notes_frame is not None:
+                    md_content += notes_frame.text
+                md_content = md_content.strip()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _is_picture(self, shape):
+        """True for PICTURE shapes and for PLACEHOLDER shapes that expose an `image` attribute."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+            return True
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+            if hasattr(shape, "image"):
+                return True
+        return False
+
+    def _is_table(self, shape):
+        """True when the shape's type is TABLE."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+            return True
+        return False
+
+    def _convert_chart_to_markdown(self, chart):
+        """Render the chart title plus a Markdown table with one row per category and one column per series."""
+        md = "\n\n### Chart"
+        if chart.has_title:
+            md += f": {chart.chart_title.text_frame.text}"
+        md += "\n\n"
+        data = []
+        category_names = [c.label for c in chart.plots[0].categories]
+        series_names = [s.name for s in chart.series]
+        data.append(["Category"] + series_names)
+
+        for idx, category in enumerate(category_names):
+            row = [category]
+            for series in chart.series:
+                row.append(series.values[idx])
+            data.append(row)
+
+        markdown_table = []
+        for row in data:
+            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
+        header = markdown_table[0]
+        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
+        return md + "\n".join([header, separator] + markdown_table[1:])
diff --git a/src/markitdown/converters/_xlsx_converter.py b/src/markitdown/converters/_xlsx_converter.py
new file mode 100644
index 0000000..c7c19ce
--- /dev/null
+++
b/src/markitdown/converters/_xlsx_converter.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict, List, Optional, Union
+
+import pandas as pd
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._html_converter import HtmlConverter
+
+
+class _ExcelConverter(HtmlConverter):
+    """
+    Shared implementation for the spreadsheet converters: subclasses set the accepted file extension and the pandas engine.
+    """
+
+    _extension = ""  # lower-case extension (with dot) that convert() accepts
+    _engine = None  # engine name passed to pandas.read_excel
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if the extension is not the one this converter handles
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != self._extension:
+            return None
+
+        # sheet_name=None loads every sheet, keyed by sheet name
+        sheets = pd.read_excel(local_path, sheet_name=None, engine=self._engine)
+        sections = []
+        for name, frame in sheets.items():
+            table = self._convert(frame.to_html(index=False)).text_content.strip()
+            sections.append(f"## {name}\n{table}")
+
+        return DocumentConverterResult(
+            title=None,
+            text_content="\n\n".join(sections).strip(),
+        )
+
+
+class XlsxConverter(_ExcelConverter):
+    """
+    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    _extension = ".xlsx"
+    _engine = "openpyxl"
+
+
+class XlsConverter(_ExcelConverter):
+    """
+    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    _extension = ".xls"
+    _engine = "xlrd"