# Convert multiple scanned PDF files in a folder to plain text.
import os
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
# Tell pytesseract where the Tesseract executable lives.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Admin\Anaconda3\Menu\tesseract.exe'

# Folder that is scanned for PDF files.
d = r'C:\Users\Admin\Downloads\pdf'
# Drop a leading U+202A mark if one is present (presumably picked up when the
# path was copy-pasted -- confirm). str.lstrip treats its argument as a set of
# characters, so only the mark is listed; adding 'C:' would also strip the
# drive prefix and turn the path into a drive-relative one.
r = d.lstrip('\u202a')

for path in os.listdir(r):
    # os.listdir returns every entry in the folder; skip anything that is not
    # a PDF so the converter is never handed an unrelated file or directory.
    if not path.lower().endswith('.pdf'):
        continue
    # Join against the same cleaned folder that was listed.
    full_path = os.path.join(r, path)
    print(full_path)
    # Pass the whole path to the converter: iterating over the string would
    # yield single characters, none of which is a file. The second positional
    # argument of convert_from_path is the rendering DPI.
    pages = convert_from_path(full_path, 500)
    # Start the image counter at 1 for each PDF.
    image_counter = 1