More converters.

parent 7a6a08b3a1
commit 254946858c

4 changed files with 155 additions and 132 deletions
src/markitdown/_markitdown.py

@@ -53,6 +53,8 @@ from .converters import (
     RssConverter,
     WikipediaConverter,
     YouTubeConverter,
+    IpynbConverter,
+    BingSerpConverter,
 )
 from .converters._markdownify import _CustomMarkdownify

@@ -92,138 +94,6 @@ finally:
     resetwarnings()


-class IpynbConverter(DocumentConverter):
-    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
-
-        # Parse and convert the notebook
-        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
-
-        return result
-
-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts notebook JSON content to Markdown."""
-        try:
-            md_output = []
-            title = None
-
-            for cell in notebook_content.get("cells", []):
-                cell_type = cell.get("cell_type", "")
-                source_lines = cell.get("source", [])
-
-                if cell_type == "markdown":
-                    md_output.append("".join(source_lines))
-
-                    # Extract the first # heading as title if not already found
-                    if title is None:
-                        for line in source_lines:
-                            if line.startswith("# "):
-                                title = line.lstrip("# ").strip()
-                                break
-
-                elif cell_type == "code":
-                    # Code cells are wrapped in Markdown code blocks
-                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
-                elif cell_type == "raw":
-                    md_output.append(f"```\n{''.join(source_lines)}\n```")
-
-            md_text = "\n\n".join(md_output)
-
-            # Check for title in notebook metadata
-            title = notebook_content.get("metadata", {}).get("title", title)
-
-            return DocumentConverterResult(
-                title=title,
-                text_content=md_text,
-            )
-
-        except Exception as e:
-            raise FileConversionException(
-                f"Error converting .ipynb file: {str(e)}"
-            ) from e
-
-
-class BingSerpConverter(DocumentConverter):
-    """
-    Handle Bing results pages (only the organic search results).
-    NOTE: It is better to use the Bing API
-    """
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
-
-        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
-        query = parsed_params.get("q", [""])[0]
-
-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
-
-        # Clean up some formatting
-        for tptt in soup.find_all(class_="tptt"):
-            if hasattr(tptt, "string") and tptt.string:
-                tptt.string += " "
-        for slug in soup.find_all(class_="algoSlug_icon"):
-            slug.extract()
-
-        # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify()
-        results = list()
-        for result in soup.find_all(class_="b_algo"):
-            # Rewrite redirect urls
-            for a in result.find_all("a", href=True):
-                parsed_href = urlparse(a["href"])
-                qs = parse_qs(parsed_href.query)
-
-                # The destination is contained in the u parameter,
-                # but appears to be base64 encoded, with some prefix
-                if "u" in qs:
-                    u = (
-                        qs["u"][0][2:].strip() + "=="
-                    )  # Python 3 doesn't care about extra padding
-
-                    try:
-                        # RFC 4648 / Base64URL variant, which uses "-" and "_"
-                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
-                    except UnicodeDecodeError:
-                        pass
-                    except binascii.Error:
-                        pass
-
-            # Convert to markdown
-            md_result = _markdownify.convert_soup(result).strip()
-            lines = [line.strip() for line in re.split(r"\n+", md_result)]
-            results.append("\n".join([line for line in lines if len(line) > 0]))
-
-        webpage_text = (
-            f"## A Bing search for '{query}' found the following results:\n\n"
-            + "\n\n".join(results)
-        )
-
-        return DocumentConverterResult(
-            title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
-        )
-
-
 class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
src/markitdown/converters/__init__.py

@@ -8,6 +8,8 @@ from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
 from ._wikipedia_converter import WikipediaConverter
 from ._youtube_converter import YouTubeConverter
+from ._ipynb_converter import IpynbConverter
+from ._bing_serp_converter import BingSerpConverter

 __all__ = [
     "DocumentConverter",
@@ -17,4 +19,6 @@ __all__ = [
     "RssConverter",
     "WikipediaConverter",
     "YouTubeConverter",
+    "IpynbConverter",
+    "BingSerpConverter",
 ]
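With both converters exported here and added to the import list in _markitdown.py above, they become reachable through the library's usual entry point. A minimal usage sketch, assuming the MarkItDown facade dispatches on file extension as it does for the existing converters (the notebook path is illustrative):

    from markitdown import MarkItDown

    md = MarkItDown()

    # Dispatch is by file extension, so .ipynb files now reach IpynbConverter.
    result = md.convert("analysis.ipynb")  # illustrative path
    print(result.title)
    print(result.text_content[:200])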
src/markitdown/converters/_bing_serp_converter.py (new file)

@@ -0,0 +1,81 @@
+# type: ignore
+import base64
+import binascii  # binascii.Error is caught in the redirect-decoding fallback below
+import re
+
+from typing import Any, Union
+from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+from bs4 import BeautifulSoup
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._markdownify import _CustomMarkdownify
+
+
+class BingSerpConverter(DocumentConverter):
+    """
+    Handle Bing results pages (only the organic search results).
+    NOTE: It is better to use the Bing API
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a Bing SERP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+            return None
+
+        # Parse the query parameters
+        parsed_params = parse_qs(urlparse(url).query)
+        query = parsed_params.get("q", [""])[0]
+
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Clean up some formatting
+        for tptt in soup.find_all(class_="tptt"):
+            if hasattr(tptt, "string") and tptt.string:
+                tptt.string += " "
+        for slug in soup.find_all(class_="algoSlug_icon"):
+            slug.extract()
+
+        # Parse the algorithmic results
+        _markdownify = _CustomMarkdownify()
+        results = list()
+        for result in soup.find_all(class_="b_algo"):
+            # Rewrite redirect urls
+            for a in result.find_all("a", href=True):
+                parsed_href = urlparse(a["href"])
+                qs = parse_qs(parsed_href.query)
+
+                # The destination is contained in the u parameter,
+                # but appears to be base64 encoded, with some prefix
+                if "u" in qs:
+                    u = (
+                        qs["u"][0][2:].strip() + "=="
+                    )  # Python 3 doesn't care about extra padding
+
+                    try:
+                        # RFC 4648 / Base64URL variant, which uses "-" and "_"
+                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
+                    except UnicodeDecodeError:
+                        pass
+                    except binascii.Error:
+                        pass
+
+            # Convert to markdown
+            md_result = _markdownify.convert_soup(result).strip()
+            lines = [line.strip() for line in re.split(r"\n+", md_result)]
+            results.append("\n".join([line for line in lines if len(line) > 0]))
+
+        webpage_text = (
+            f"## A Bing search for '{query}' found the following results:\n\n"
+            + "\n\n".join(results)
+        )
+
+        return DocumentConverterResult(
+            title=None if soup.title is None else soup.title.string,
+            text_content=webpage_text,
+        )
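The redirect rewriting is the subtle part of this file: Bing wraps each organic result link in a redirect URL whose u parameter carries the destination, base64url-encoded behind a two-character prefix. A standalone sketch of the same decode step (the "a1" prefix and sample URL are illustrative assumptions, not taken from the commit):

    import base64
    import binascii
    from urllib.parse import parse_qs, urlparse

    def decode_bing_redirect(href: str) -> str:
        """Recover the destination from a Bing redirect URL, as the converter does."""
        qs = parse_qs(urlparse(href).query)
        if "u" not in qs:
            return href  # not a redirect; leave untouched
        # Strip the two-character prefix and over-pad; extra '=' is harmless.
        u = qs["u"][0][2:].strip() + "=="
        try:
            # Base64URL alphabet (RFC 4648) uses '-' and '_' instead of '+' and '/'.
            return base64.b64decode(u, altchars="-_").decode("utf-8")
        except (UnicodeDecodeError, binascii.Error):
            return href

    # Round-trip check with an illustrative "a1" prefix:
    u = "a1" + base64.urlsafe_b64encode(b"https://example.com/").decode().rstrip("=")
    print(decode_bing_redirect(f"https://www.bing.com/ck/a?u={u}"))
    # -> https://example.com/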
src/markitdown/converters/_ipynb_converter.py (new file)

@@ -0,0 +1,68 @@
+import json
+from typing import Any, Dict, List, Optional, Union
+
+from ._base import (
+    DocumentConverter,
+    DocumentConverterResult,
+    FileConversionException,  # needed by _convert below; assumed to be exported by ._base
+)
+
+
+class IpynbConverter(DocumentConverter):
+    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not ipynb
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".ipynb":
+            return None
+
+        # Parse and convert the notebook
+        result = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            notebook_content = json.load(fh)
+            result = self._convert(notebook_content)
+
+        return result
+
+    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts notebook JSON content to Markdown."""
+        try:
+            md_output = []
+            title = None
+
+            for cell in notebook_content.get("cells", []):
+                cell_type = cell.get("cell_type", "")
+                source_lines = cell.get("source", [])
+
+                if cell_type == "markdown":
+                    md_output.append("".join(source_lines))
+
+                    # Extract the first # heading as title if not already found
+                    if title is None:
+                        for line in source_lines:
+                            if line.startswith("# "):
+                                title = line.lstrip("# ").strip()
+                                break
+
+                elif cell_type == "code":
+                    # Code cells are wrapped in Markdown code blocks
+                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
+                elif cell_type == "raw":
+                    md_output.append(f"```\n{''.join(source_lines)}\n```")
+
+            md_text = "\n\n".join(md_output)
+
+            # Check for title in notebook metadata
+            title = notebook_content.get("metadata", {}).get("title", title)
+
+            return DocumentConverterResult(
+                title=title,
+                text_content=md_text,
+            )
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Error converting .ipynb file: {str(e)}"
+            ) from e
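The converter can be exercised directly, without going through the full MarkItDown pipeline, by writing a minimal nbformat-style notebook to disk. A sketch (the cell contents are illustrative):

    import json
    import tempfile

    from markitdown.converters import IpynbConverter

    # One markdown cell (supplies the title) and one code cell.
    notebook = {
        "cells": [
            {"cell_type": "markdown", "source": ["# Demo notebook\n"]},
            {"cell_type": "code", "source": ["print('hello')\n"]},
        ],
        "metadata": {},
    }

    with tempfile.NamedTemporaryFile("w", suffix=".ipynb", delete=False) as fh:
        json.dump(notebook, fh)
        path = fh.name

    result = IpynbConverter().convert(path, file_extension=".ipynb")
    print(result.title)         # -> Demo notebook
    print(result.text_content)  # markdown text, then a fenced python block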