#from pdf2image import convert_from_bytes, convert_from_path
from io import BytesIO
from PIL import Image
from pathlib import Path
import os, re #,tesserocr
from py_pdf_parser.loaders import load_file

def clean_text(text):
	"""Return *text* with every disallowed character removed.

	A character is kept when ``str.isalnum()`` is true for it, or when it is
	one of ``. : - , + @ ; ' "``, a space, a tab or a newline. Every other
	character is dropped.

	Args:
		text: The string to filter.

	Returns:
		A new string holding only the kept characters, in their original order.
	"""
	# A frozenset gives constant-time membership tests for the punctuation whitelist.
	allowed = frozenset('.:-, +@;\'"\t\n')
	# Joining once avoids rebuilding the result string for every kept character.
	return ''.join(char for char in text if char.isalnum() or char in allowed)


def get_text_fitz(pdf):
	"""Extract the cleaned, non-empty text lines of a PDF.

	Despite the name, the document is loaded with ``load_file`` (imported from
	``py_pdf_parser.loaders``). Each element's text is passed through
	``clean_text`` and split on newlines. Every resulting line is stripped,
	has non-breaking spaces replaced by regular spaces and has any
	"DocuSign Envelope ID: ..." stamp removed.

	Args:
		pdf: Passed unchanged to ``load_file``.

	Returns:
		A list of the non-empty lines, in the order the elements are iterated.
	"""
	envelope_stamp = r'DocuSign Envelope ID:\s[A-Z0-9\-]+\b'
	document = load_file(pdf)
	lines = []
	for element in document.elements:
		# split() yields the whole string as a single item when it contains no
		# newline, so one loop covers both the single- and multi-line case.
		for raw_line in clean_text(element.text()).split('\n'):
			line = raw_line.strip().replace(u'\xa0', u' ')
			line = re.sub(envelope_stamp, '', line).strip()
			if line:
				lines.append(line)
	return lines

#def get_text_fitz(pdf, with_api=True):

def convert_pdf_to_images(pdf):
	"""Render the pages of a PDF as RGB PIL images.

	Args:
		pdf: The PDF content as bytes; handed to ``pdf2image.convert_from_bytes``.

	Returns:
		A list with one RGB PIL image per item returned by ``convert_from_bytes``.

	Raises:
		ImportError: If the ``pdf2image`` package is not installed.
	"""
	# The module-level pdf2image import is commented out, so the name is
	# resolved here instead of failing with NameError; importing lazily also
	# means pdf2image is only required when this function is actually called.
	from pdf2image import convert_from_bytes

	images_of_pages = []
	for page in convert_from_bytes(pdf):
		with BytesIO() as buffer:
			# Round-trip each page through an in-memory JPEG before reopening it.
			page.save(buffer, format="jpeg")
			buffer.seek(0)
			# convert() loads the pixel data, so the image stays usable after
			# the buffer is closed at the end of the with-block.
			images_of_pages.append(Image.open(buffer).convert("RGB"))
	return images_of_pages

def get_text_tesseract(pdf):
	"""OCR a PDF and return its text as a single string.

	Args:
		pdf: An object with a ``read()`` method; the value it returns is
			handed to ``convert_pdf_to_images``.

	Returns:
		The OCR text of all page images joined with single spaces, with
		non-breaking spaces replaced by regular spaces and any
		"DocuSign Envelope ID: ..." stamp removed. An empty string when no
		page images are produced.

	Raises:
		ImportError: If the ``tesserocr`` package is not installed.
	"""
	# The module-level tesserocr import is commented out, so the name is
	# resolved here instead of failing with NameError on first use.
	import tesserocr

	images = convert_pdf_to_images(pdf.read())
	if not images:
		# Without this guard the result variable was never assigned and the
		# final return raised UnboundLocalError.
		return ''
	text = " ".join(tesserocr.image_to_text(image) for image in images)
	text = text.strip().replace(u'\xa0', u' ')
	return re.sub(r'DocuSign Envelope ID:\s[A-Z0-9\-]+\b', '', text).strip()

def txt_from_pdf(file):
	"""Extract text from a PDF, falling back to OCR when little text is found.

	Args:
		file: The PDF to read. It is passed as-is to ``get_text_fitz``
			(presumably a filesystem path, since that function forwards it
			to ``load_file`` - confirm against callers).

	Returns:
		The list of lines from ``get_text_fitz`` when it has at least 10
		entries; otherwise the single string returned by
		``get_text_tesseract``.
	"""
	text = get_text_fitz(file)
	if len(text) >= 10:
		return text
	# get_text_tesseract calls .read() on its argument. Objects that already
	# provide read() are passed through unchanged; anything else is treated as
	# a path and opened in binary mode so the OCR fallback does not fail with
	# AttributeError.
	if hasattr(file, 'read'):
		return get_text_tesseract(file)
	with open(file, 'rb') as pdf_stream:
		return get_text_tesseract(pdf_stream)