From b122931185fb3e1af31fd6b836f16e6475d255be Mon Sep 17 00:00:00 2001 From: Michael Gmelin Date: Mon, 30 Nov 2020 22:54:22 +0100 Subject: [PATCH] Trim whitespace characters when getting text from PDF. pdftotext[0] returns one entry per empty page (newline + whitespace). When these entries are joined with newlines, the resulting text grows with each page. Beyond a certain document size, this exceeds the 50-character limit for skipping OCR of a page (unless PAPERLESS_OCR_ALWAYS is enabled), so larger documents are no longer OCRed and their text consists of only a couple of whitespace lines. By stripping the result of pdftotext, text consisting only of such whitespace is shortened, so that OCR can still happen. As a side effect, text retrieved from pdftotext is a bit nicer that way too. Trimming each page and leaving empty ones out was also considered, but simply stripping the result seemed less intrusive. [0] Tested with pdftotext 2.1.4. --- src/paperless_tesseract/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index fd707783..a88c8d15 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -285,4 +285,4 @@ def get_text_from_pdf(pdf_file): except pdftotext.Error: return "" - return "\n".join(pdf) + return "\n".join(pdf).strip()