"""Extract text from PDF files, falling back to OCR when little text is found."""
from io import BytesIO
from pathlib import Path
import os
import re

# Third-party backends (py_pdf_parser, pdf2image, Pillow, tesserocr) are
# imported inside the functions that use them, so this module can be imported
# even when the PDF/OCR stack is not installed.

# Characters, besides letters and digits, that clean_text() keeps.
_KEPT_PUNCTUATION = frozenset('.:-, +@;\'"\t\n')

# "DocuSign Envelope ID: <id>" stamp that is removed from extracted text.
_DOCUSIGN_ID_RE = re.compile(r'DocuSign Envelope ID:\s[A-Z0-9\-]+\b')


def clean_text(text):
    """Return *text* keeping only alphanumerics and a small punctuation set.

    Kept besides letters and digits: ``. : - , + @ ; ' "``, space, tab and
    newline. Every other character is dropped.
    """
    return ''.join(
        char for char in text
        if char.isalnum() or char in _KEPT_PUNCTUATION
    )


def get_text_fitz(pdf):
    """Extract the text elements of a PDF as a list of cleaned lines.

    Args:
        pdf: Value handed to ``py_pdf_parser.loaders.load_file``.

    Returns:
        A list of non-empty, stripped lines with DocuSign envelope stamps
        removed.
    """
    from py_pdf_parser.loaders import load_file

    doc = load_file(pdf)
    lines = []
    for element in doc.elements:
        # Turn non-breaking spaces into plain spaces *before* cleaning:
        # clean_text() drops '\xa0', which would glue neighbouring words.
        cleaned = clean_text(element.text().replace('\xa0', ' '))
        # split() yields a single item when there is no newline.
        lines.extend(cleaned.split('\n'))
    lines = [_DOCUSIGN_ID_RE.sub('', line.strip()).strip() for line in lines]
    return [line for line in lines if line]


def convert_pdf_to_images(pdf):
    """Render each page of a PDF, given as raw bytes, to an RGB PIL image.

    Args:
        pdf: The PDF document as ``bytes``.

    Returns:
        A list with one RGB ``PIL.Image.Image`` per page.
    """
    from pdf2image import convert_from_bytes
    from PIL import Image

    images_of_pages = []
    for page in convert_from_bytes(pdf):
        with BytesIO() as buffer:
            # Re-encode the page as an in-memory JPEG and decode it to RGB;
            # convert() loads the pixels before the buffer is closed.
            page.save(buffer, format="jpeg")
            buffer.seek(0)
            images_of_pages.append(Image.open(buffer).convert("RGB"))
    return images_of_pages


def _read_pdf_bytes(pdf):
    """Return the raw bytes of *pdf* (bytes, file-like object, or path)."""
    if isinstance(pdf, (bytes, bytearray)):
        return bytes(pdf)
    if hasattr(pdf, "read"):
        return pdf.read()
    return Path(pdf).read_bytes()


def get_text_tesseract(pdf):
    """OCR every page of a PDF with tesserocr and return the combined text.

    Args:
        pdf: A binary file-like object, raw PDF bytes, or a filesystem path.

    Returns:
        One string: the page texts joined by a space, with non-breaking
        spaces replaced and DocuSign envelope stamps removed.
    """
    import tesserocr

    images = convert_pdf_to_images(_read_pdf_bytes(pdf))
    file_text = " ".join(tesserocr.image_to_text(image) for image in images)
    text = file_text.strip().replace('\xa0', ' ')
    return _DOCUSIGN_ID_RE.sub('', text).strip()


def txt_from_pdf(file):
    """Extract text from a PDF, using OCR when fewer than 10 lines are found.

    Args:
        file: The PDF to read; it is passed to get_text_fitz() first and, if
            needed, to get_text_tesseract().

    Returns:
        The list of lines from get_text_fitz(), or, when that list has fewer
        than 10 entries, the single OCR string from get_text_tesseract().
        NOTE(review): the two paths return different types (list vs. str);
        callers must handle both.
    """
    text = get_text_fitz(file)
    if len(text) < 10:
        text = get_text_tesseract(file)
    return text