Skip to content

Commit d7ec3e2

Browse files
committed
File ext + type detection improvements.
1 parent 3caa6f2 commit d7ec3e2

4 files changed

Lines changed: 20 additions & 6 deletions

File tree

ocr_service/processor/converter.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -260,7 +260,7 @@ def _preprocess_doc(self, stream: bytes, file_name: str) -> bytes:
260260
os.fsync(tmp_pdf_file.fileno())
261261
pdf_stream = tmp_pdf_file.read()
262262

263-
if b"%PDF-" not in pdf_stream[:64]:
263+
if not pdf_stream.startswith(b"%PDF-"):
264264
self.log.warning("invalid pdf header for file %s", pdf_file_path)
265265
pdf_stream = b""
266266
else:

ocr_service/processor/processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def _process(self, stream: bytes, file_name: str) -> tuple[str, dict]:
4242
"""
4343

4444
file_type = detect_file_type(stream)
45-
file_name = normalise_file_name_with_ext(file_name, stream)
45+
file_name = normalise_file_name_with_ext(file_name, stream, file_type)
4646
ctx = ProcessContext(stream=stream, file_name=file_name, file_type=file_type)
4747
ctx.metadata["content-type"] = self.converter.resolve_content_type(file_type)
4848

ocr_service/tests/test_filename_handling.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,14 @@ def test_unknown_binary_name_remains_extensionless(self):
1414
file_name = normalise_file_name_with_ext("request-id", b"\x00\x01\x02\x03")
1515
self.assertEqual(file_name, "request-id")
1616

17+
def test_detected_file_type_extension_is_used_when_name_has_no_suffix(self):
18+
file_type = Mock()
19+
file_type.extension = "docx"
20+
21+
file_name = normalise_file_name_with_ext("request-id", b"\x00\x01\x02\x03", file_type)
22+
23+
self.assertEqual(file_name, "request-id.docx")
24+
1725
def test_processor_passes_extensionless_unknown_name_to_converter(self):
1826
processor = Processor()
1927
processor.converter = Mock()

ocr_service/utils/utils.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -335,7 +335,7 @@ def detect_file_type(stream: bytes) -> object | None:
335335
return file_type
336336

337337

338-
def normalise_file_name_with_ext(file_name: str, stream: bytes) -> str:
338+
def normalise_file_name_with_ext(file_name: str, stream: bytes, file_type: object | None = None) -> str:
339339
"""Normalize filename and ensure an extension is present.
340340
341341
LibreOffice relies on a reasonable filename with an extension to select
@@ -346,6 +346,7 @@ def normalise_file_name_with_ext(file_name: str, stream: bytes) -> str:
346346
Args:
347347
file_name: Original file name (may be empty or extension-less).
348348
stream: File content used for extension inference.
349+
file_type: Optional previously detected file type descriptor.
349350
350351
Returns:
351352
str: Normalized file name with an extension.
@@ -361,20 +362,25 @@ def normalise_file_name_with_ext(file_name: str, stream: bytes) -> str:
361362
if ext:
362363
return base + ext
363364

364-
# 2) let filetype guess it from content
365+
# 2) prefer an already detected extension when available
366+
detected_ext = getattr(file_type, "extension", None)
367+
if detected_ext:
368+
return f"{base}.{str(detected_ext)}"
369+
370+
# 3) let filetype guess it from content
365371
guessed_ext = filetype.guess_extension(stream)
366372
if guessed_ext:
367373
return f"{base}.{guessed_ext}"
368374

369-
# 3) fallbacks for texty formats our filetype may not catch
375+
# 4) fallbacks for texty formats our filetype may not catch
370376
if is_file_type_html(stream):
371377
return base + ".html"
372378
if is_file_type_xml(stream):
373379
return base + ".xml"
374380
if is_file_type_rtf(stream):
375381
return base + ".rtf"
376382

377-
# 4) only tag as plain text when the content actually looks like text
383+
# 5) only tag as plain text when the content actually looks like text
378384
if is_file_content_plain_text(stream):
379385
return base + ".txt"
380386

0 commit comments

Comments
 (0)