fix: improve metadata and description extraction logic
This commit is contained in:
parent
8f76393ad8
commit
8363f419ab
1 changed file with 14 additions and 12 deletions
|
|
@ -48,29 +48,31 @@ class YouTubeConverter(DocumentConverter):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Read the meta tags
|
# Read the meta tags
|
||||||
assert soup.title is not None and soup.title.string is not None
|
|
||||||
metadata: Dict[str, str] = {"title": soup.title.string}
|
metadata: Dict[str, str] = {"title": soup.title.string}
|
||||||
for meta in soup(["meta"]):
|
for meta in soup(["meta"]):
|
||||||
for a in meta.attrs:
|
for a in meta.attrs:
|
||||||
if a in ["itemprop", "property", "name"]:
|
if a in ["itemprop", "property", "name"]:
|
||||||
metadata[meta[a]] = meta.get("content", "")
|
content = meta.get("content", "")
|
||||||
|
if content: # Only add non-empty content
|
||||||
|
metadata[meta[a]] = content
|
||||||
break
|
break
|
||||||
|
|
||||||
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
|
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
|
||||||
try:
|
try:
|
||||||
for script in soup(["script"]):
|
for script in soup(["script"]):
|
||||||
content = script.text
|
if not script.string: # Skip empty scripts
|
||||||
|
continue
|
||||||
|
content = script.string
|
||||||
if "ytInitialData" in content:
|
if "ytInitialData" in content:
|
||||||
lines = re.split(r"\r?\n", content)
|
match = re.search(r"var ytInitialData = ({.*?});", content)
|
||||||
obj_start = lines[0].find("{")
|
if match:
|
||||||
obj_end = lines[0].rfind("}")
|
data = json.loads(match.group(1))
|
||||||
if obj_start >= 0 and obj_end >= 0:
|
attrdesc = self._findKey(data, "attributedDescriptionBodyText")
|
||||||
data = json.loads(lines[0][obj_start : obj_end + 1])
|
if attrdesc and isinstance(attrdesc, dict):
|
||||||
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
metadata["description"] = str(attrdesc.get("content", ""))
|
||||||
if attrdesc:
|
|
||||||
metadata["description"] = str(attrdesc["content"])
|
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
print(f"Error extracting description: {e}")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Start preparing the page
|
# Start preparing the page
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue