From cf512afe8e84237742bb20bbac7ac8d8a575fa74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:15:16 +0200 Subject: [PATCH] refactor: set html.parser in bs4 xhtml parsing call --- tika/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tika/pdf.py b/tika/pdf.py index d2c2fdaa..48c2736f 100644 --- a/tika/pdf.py +++ b/tika/pdf.py @@ -26,7 +26,7 @@ def text_from_pdf_pages(filename): # Read PDF file data = parser.from_file(filename, xmlContent=True) - xhtml_data = BeautifulSoup(data['content']) + xhtml_data = BeautifulSoup(data['content'], features="html.parser") for i, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})): # Parse PDF data using TIKA (xml/html) # It's faster and safer to create a new buffer than truncating it