From cf512afe8e84237742bb20bbac7ac8d8a575fa74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?=
 <35225576+afuetterer@users.noreply.github.com>
Date: Mon, 27 Apr 2026 09:15:16 +0200
Subject: [PATCH] refactor: set html.parser in bs4 xhtml parsing call

---
 tika/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika/pdf.py b/tika/pdf.py
index d2c2fdaa..48c2736f 100644
--- a/tika/pdf.py
+++ b/tika/pdf.py
@@ -26,7 +26,7 @@ def text_from_pdf_pages(filename):
 
     # Read PDF file
     data = parser.from_file(filename, xmlContent=True)
-    xhtml_data = BeautifulSoup(data['content'])
+    xhtml_data = BeautifulSoup(data['content'], features="html.parser")
     for i, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):
         # Parse PDF data using TIKA (xml/html)
         # It's faster and safer to create a new buffer than truncating it