More converters.
This commit is contained in:
parent 7a6a08b3a1
commit 254946858c
4 changed files with 155 additions and 132 deletions
@@ -53,6 +53,8 @@ from .converters import (
     RssConverter,
     WikipediaConverter,
     YouTubeConverter,
+    IpynbConverter,
+    BingSerpConverter,
 )
 from .converters._markdownify import _CustomMarkdownify
@@ -92,138 +94,6 @@ finally:
     resetwarnings()
 
 
-class IpynbConverter(DocumentConverter):
-    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
-
-        # Parse and convert the notebook
-        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
-
-        return result
-
-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts notebook JSON content to Markdown."""
-        try:
-            md_output = []
-            title = None
-
-            for cell in notebook_content.get("cells", []):
-                cell_type = cell.get("cell_type", "")
-                source_lines = cell.get("source", [])
-
-                if cell_type == "markdown":
-                    md_output.append("".join(source_lines))
-
-                    # Extract the first # heading as title if not already found
-                    if title is None:
-                        for line in source_lines:
-                            if line.startswith("# "):
-                                title = line.lstrip("# ").strip()
-                                break
-
-                elif cell_type == "code":
-                    # Code cells are wrapped in Markdown code blocks
-                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
-                elif cell_type == "raw":
-                    md_output.append(f"```\n{''.join(source_lines)}\n```")
-
-            md_text = "\n\n".join(md_output)
-
-            # Check for title in notebook metadata
-            title = notebook_content.get("metadata", {}).get("title", title)
-
-            return DocumentConverterResult(
-                title=title,
-                text_content=md_text,
-            )
-
-        except Exception as e:
-            raise FileConversionException(
-                f"Error converting .ipynb file: {str(e)}"
-            ) from e
-
-
-class BingSerpConverter(DocumentConverter):
-    """
-    Handle Bing results pages (only the organic search results).
-    NOTE: It is better to use the Bing API
-    """
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
-
-        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
-        query = parsed_params.get("q", [""])[0]
-
-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
-
-        # Clean up some formatting
-        for tptt in soup.find_all(class_="tptt"):
-            if hasattr(tptt, "string") and tptt.string:
-                tptt.string += " "
-        for slug in soup.find_all(class_="algoSlug_icon"):
-            slug.extract()
-
-        # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify()
-        results = list()
-        for result in soup.find_all(class_="b_algo"):
-            # Rewrite redirect urls
-            for a in result.find_all("a", href=True):
-                parsed_href = urlparse(a["href"])
-                qs = parse_qs(parsed_href.query)
-
-                # The destination is contained in the u parameter,
-                # but appears to be base64 encoded, with some prefix
-                if "u" in qs:
-                    u = (
-                        qs["u"][0][2:].strip() + "=="
-                    )  # Python 3 doesn't care about extra padding
-
-                    try:
-                        # RFC 4648 Base64URL variant, which uses "-" and "_"
-                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
-                    except UnicodeDecodeError:
-                        pass
-                    except binascii.Error:
-                        pass
-
-            # Convert to markdown
-            md_result = _markdownify.convert_soup(result).strip()
-            lines = [line.strip() for line in re.split(r"\n+", md_result)]
-            results.append("\n".join([line for line in lines if len(line) > 0]))
-
-        webpage_text = (
-            f"## A Bing search for '{query}' found the following results:\n\n"
-            + "\n\n".join(results)
-        )
-
-        return DocumentConverterResult(
-            title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
-        )
-
-
 class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
@@ -8,6 +8,8 @@ from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
 from ._wikipedia_converter import WikipediaConverter
 from ._youtube_converter import YouTubeConverter
+from ._ipynb_converter import IpynbConverter
+from ._bing_serp_converter import BingSerpConverter
 
 __all__ = [
     "DocumentConverter",
@@ -17,4 +19,6 @@ __all__ = [
     "RssConverter",
     "WikipediaConverter",
     "YouTubeConverter",
+    "IpynbConverter",
+    "BingSerpConverter",
 ]
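With the exports above in place, the new converters can be exercised directly. A minimal sketch, assuming the package layout this commit establishes (the notebook path is hypothetical); per the `convert` implementations in the new files below, a converter returns None when it declines a file:

from markitdown.converters import IpynbConverter

converter = IpynbConverter()
# file_extension is the kwarg the converter inspects before doing any work
result = converter.convert("notebook.ipynb", file_extension=".ipynb")
if result is not None:
    print(result.title)         # first "# " heading, or the metadata title
    print(result.text_content)  # the assembled Markdown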
81  src/markitdown/converters/_bing_serp_converter.py  Normal file
@@ -0,0 +1,81 @@
+# type: ignore
+import base64
+import binascii
+import re
+
+from typing import Any, Union
+from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+
+from bs4 import BeautifulSoup
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._markdownify import _CustomMarkdownify
+
+
+class BingSerpConverter(DocumentConverter):
+    """
+    Handle Bing results pages (only the organic search results).
+    NOTE: It is better to use the Bing API
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a Bing SERP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+            return None
+
+        # Parse the query parameters
+        parsed_params = parse_qs(urlparse(url).query)
+        query = parsed_params.get("q", [""])[0]
+
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Clean up some formatting
+        for tptt in soup.find_all(class_="tptt"):
+            if hasattr(tptt, "string") and tptt.string:
+                tptt.string += " "
+        for slug in soup.find_all(class_="algoSlug_icon"):
+            slug.extract()
+
+        # Parse the algorithmic results
+        _markdownify = _CustomMarkdownify()
+        results = list()
+        for result in soup.find_all(class_="b_algo"):
+            # Rewrite redirect urls
+            for a in result.find_all("a", href=True):
+                parsed_href = urlparse(a["href"])
+                qs = parse_qs(parsed_href.query)
+
+                # The destination is contained in the u parameter,
+                # but appears to be base64 encoded, with some prefix
+                if "u" in qs:
+                    u = (
+                        qs["u"][0][2:].strip() + "=="
+                    )  # Python 3 doesn't care about extra padding
+
+                    try:
+                        # RFC 4648 Base64URL variant, which uses "-" and "_"
+                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
+                    except UnicodeDecodeError:
+                        pass
+                    except binascii.Error:
+                        pass
+
+            # Convert to markdown
+            md_result = _markdownify.convert_soup(result).strip()
+            lines = [line.strip() for line in re.split(r"\n+", md_result)]
+            results.append("\n".join([line for line in lines if len(line) > 0]))
+
+        webpage_text = (
+            f"## A Bing search for '{query}' found the following results:\n\n"
+            + "\n\n".join(results)
+        )
+
+        return DocumentConverterResult(
+            title=None if soup.title is None else soup.title.string,
+            text_content=webpage_text,
+        )
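The redirect rewriting above relies on Bing packing the destination URL into the `u` query parameter as unpadded Base64URL data behind a two-character prefix. A round-trip sketch of just that decode step; the destination URL and the "a1" prefix are illustrative choices here, not values taken from a real SERP:

import base64

destination = "https://example.com/some/page"

# Build a 'u' value the way the parameter appears to be constructed:
# a short prefix, then Base64URL data with its padding stripped.
encoded = "a1" + base64.urlsafe_b64encode(
    destination.encode("utf-8")
).decode("ascii").rstrip("=")

# Mirror the converter: drop the 2-character prefix, over-pad, decode.
u = encoded[2:].strip() + "=="  # b64decode tolerates extra padding
assert base64.b64decode(u, altchars="-_").decode("utf-8") == destination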
68  src/markitdown/converters/_ipynb_converter.py  Normal file
@@ -0,0 +1,68 @@
+import json
+
+from typing import Any, Dict, List, Optional, Union
+
+from ._base import (
+    DocumentConverter,
+    DocumentConverterResult,
+    FileConversionException,
+)
+
+
+class IpynbConverter(DocumentConverter):
+    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not ipynb
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".ipynb":
+            return None
+
+        # Parse and convert the notebook
+        result = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            notebook_content = json.load(fh)
+            result = self._convert(notebook_content)
+
+        return result
+
+    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts notebook JSON content to Markdown."""
+        try:
+            md_output = []
+            title = None
+
+            for cell in notebook_content.get("cells", []):
+                cell_type = cell.get("cell_type", "")
+                source_lines = cell.get("source", [])
+
+                if cell_type == "markdown":
+                    md_output.append("".join(source_lines))
+
+                    # Extract the first # heading as title if not already found
+                    if title is None:
+                        for line in source_lines:
+                            if line.startswith("# "):
+                                title = line.lstrip("# ").strip()
+                                break
+
+                elif cell_type == "code":
+                    # Code cells are wrapped in Markdown code blocks
+                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
+                elif cell_type == "raw":
+                    md_output.append(f"```\n{''.join(source_lines)}\n```")
+
+            md_text = "\n\n".join(md_output)
+
+            # Check for title in notebook metadata
+            title = notebook_content.get("metadata", {}).get("title", title)
+
+            return DocumentConverterResult(
+                title=title,
+                text_content=md_text,
+            )
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Error converting .ipynb file: {str(e)}"
+            ) from e
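As a quick check of `_convert`'s behavior, here is a hand-built notebook dict (the cell contents are hypothetical) run through the converter. Per the logic above, the first "# " heading becomes the title unless notebook metadata supplies one, markdown cells pass through verbatim, and code cells are fenced as python:

from markitdown.converters import IpynbConverter

notebook = {
    "cells": [
        {"cell_type": "markdown", "source": ["# My Analysis\n", "Some prose.\n"]},
        {"cell_type": "code", "source": ["print('hello')\n"]},
    ],
    "metadata": {},
}

result = IpynbConverter()._convert(notebook)
assert result.title == "My Analysis"
assert result.text_content.startswith("# My Analysis")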