fix: improve metadata and description extraction logic

This commit is contained in:
Nima 2025-02-18 19:28:31 +01:00
parent 8f76393ad8
commit 8363f419ab

View file

@@ -48,29 +48,31 @@ class YouTubeConverter(DocumentConverter):
return None return None
# Read the meta tags # Read the meta tags
assert soup.title is not None and soup.title.string is not None
metadata: Dict[str, str] = {"title": soup.title.string} metadata: Dict[str, str] = {"title": soup.title.string}
for meta in soup(["meta"]): for meta in soup(["meta"]):
for a in meta.attrs: for a in meta.attrs:
if a in ["itemprop", "property", "name"]: if a in ["itemprop", "property", "name"]:
metadata[meta[a]] = meta.get("content", "") content = meta.get("content", "")
if content: # Only add non-empty content
metadata[meta[a]] = content
break break
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
try: try:
for script in soup(["script"]): for script in soup(["script"]):
content = script.text if not script.string: # Skip empty scripts
continue
content = script.string
if "ytInitialData" in content: if "ytInitialData" in content:
lines = re.split(r"\r?\n", content) match = re.search(r"var ytInitialData = ({.*?});", content)
obj_start = lines[0].find("{") if match:
obj_end = lines[0].rfind("}") data = json.loads(match.group(1))
if obj_start >= 0 and obj_end >= 0: attrdesc = self._findKey(data, "attributedDescriptionBodyText")
data = json.loads(lines[0][obj_start : obj_end + 1]) if attrdesc and isinstance(attrdesc, dict):
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore metadata["description"] = str(attrdesc.get("content", ""))
if attrdesc:
metadata["description"] = str(attrdesc["content"])
break break
except Exception: except Exception as e:
print(f"Error extracting description: {e}")
pass pass
# Start preparing the page # Start preparing the page