To extract text from a scanned PDF, we first need to download and install Tesseract and Poppler.
Code to extract text from a scanned PDF using Python:
from PIL import Image
from pytesseract import image_to_string
import pytesseract
from pdf2image import convert_from_path
def convert_pdf_to_img(
    pdf_file,
    poppler_path=".../Release-23.07.0-0/poppler-23.07.0/Library/bin",
):
    """Render the pages of a PDF file as images.

    Args:
        pdf_file: Path of the PDF file to convert.
        poppler_path: Directory holding the Poppler binaries; forwarded
            unchanged to ``convert_from_path``. The default is the
            location this script was originally written against, so
            override it to match the local Poppler installation.

    Returns:
        The page images produced by ``pdf2image.convert_from_path``.
    """
    return convert_from_path(pdf_file, poppler_path=poppler_path)
def convert_image_to_text(
    file,
    tesseract_cmd="/Files/Tesseract-OCR/tesseract.exe",
):
    """Run Tesseract OCR on one image and return the recognised text.

    Args:
        file: The image to OCR; passed straight to ``image_to_string``.
        tesseract_cmd: Path of the Tesseract executable. The default is
            the location this script was originally written against, so
            override it to match the local Tesseract installation.

    Returns:
        The result of ``image_to_string`` for the image.
    """
    # This is a module-level pytesseract setting, so it has to be assigned
    # before image_to_string is called (and it stays set afterwards).
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    return image_to_string(file)
def get_text_from_any_pdf(pdf_file):
    """Extract the text of a PDF by running OCR on each of its pages.

    The PDF is first converted to page images with ``convert_pdf_to_img``;
    each image is then passed to ``convert_image_to_text``.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order with no separator
        added between pages; an empty string if there are no page images.
    """
    images = convert_pdf_to_img(pdf_file)
    # join() builds the result in one pass instead of repeated "+=".
    return "".join(convert_image_to_text(img) for img in images)
# Scanned PDF to run through OCR.
filepath = '../Page_14.pdf'
# Print the result: a bare call discards the extracted text when this is
# run as a plain script, so nothing would be shown.
print(get_text_from_any_pdf(filepath))
Output: The text will be extracted from the PDF.