Add LLM-based image description to PptxConverter

Signed-off-by: Hankyeol Kyung <kghnkl0103@gmail.com>
This commit is contained in:
Hankyeol Kyung 2024-12-26 17:27:03 +09:00
parent 125e206047
commit 7fe32073de
No known key found for this signature in database
GPG key ID: 0430C7F42578E222

View file

@ -768,6 +768,17 @@ class PptxConverter(HtmlConverter):
except Exception: except Exception:
pass pass
# Try describing the image using GPTV
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
alt_text += self._get_llm_description(
shape.image.blob,
llm_client,
llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
# A placeholder name # A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg" filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += ( md_content += (
@ -857,6 +868,31 @@ class PptxConverter(HtmlConverter):
separator = "|" + "|".join(["---"] * len(data[0])) + "|" separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\n".join([header, separator] + markdown_table[1:]) return md + "\n".join([header, separator] + markdown_table[1:])
def _get_llm_description(
    self, image_blob, client, model, prompt=None, content_type=None
):
    """Ask a chat-completions style client to describe an image.

    The image is embedded in the request as a base64 ``data:`` URI next to
    a text prompt, and the text of the first returned choice is handed back.

    Args:
        image_blob: Raw bytes of the image to describe.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model: Model identifier forwarded unchanged to the client.
        prompt: Instruction sent with the image. ``None`` or a
            whitespace-only string selects a generic caption prompt.
        content_type: MIME type to advertise in the data URI. When omitted
            it is detected from the blob's leading bytes (PNG, GIF, WebP)
            and otherwise falls back to ``"image/jpeg"``.

    Returns:
        The description as a string. Always a ``str`` (possibly empty), so
        callers can safely chain string methods such as ``.strip()``.

    Exceptions raised by ``client`` are not caught here.
    """
    # Nothing to describe: skip the request entirely.
    if not image_blob:
        return ""

    if prompt is None or prompt.strip() == "":
        prompt = "Write a caption for this image."

    if not content_type:
        # Slide pictures are not necessarily JPEG; label the payload by its
        # signature so the data URI does not misreport the format.
        header = bytes(image_blob[:12])
        if header.startswith(b"\x89PNG\r\n\x1a\n"):
            content_type = "image/png"
        elif header[:6] in (b"GIF87a", b"GIF89a"):
            content_type = "image/gif"
        elif header[:4] == b"RIFF" and header[8:12] == b"WEBP":
            content_type = "image/webp"
        else:
            content_type = "image/jpeg"

    image_base64 = base64.b64encode(image_blob).decode("utf-8")
    data_uri = f"data:{content_type};base64,{image_base64}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ],
        }
    ]

    response = client.chat.completions.create(model=model, messages=messages)

    # The reply may carry no choices or a null content field; normalise
    # both to an empty string instead of returning None or raising.
    choices = getattr(response, "choices", None)
    if not choices:
        return ""
    return choices[0].message.content or ""
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):
""" """