1. 程式人生 > 實用技巧 > pytesseract結合PIL庫進行OCR識別

pytesseract結合PIL庫進行OCR識別

1.獲取需要OCR識別的圖片

import os
import time

import pytesseract
from PIL import Image

def screenshots_picture(driver, locator):
    """Screenshot the current page and crop out the element to be OCR-ed.

    The full page is saved first; the located element's bounding box is then
    cut out of that screenshot and saved as a second file. Both files are
    placed in the directory managed by ``filePictruePath`` and are named
    after the current timestamp.

    :param driver: browser driver; must provide ``save_screenshot``,
        ``find_element`` and ``execute_script``
    :param locator: locator tuple, unpacked into ``driver.find_element(*locator)``
    :return: path of the cropped image file, or ``None`` if any step raised

    Note:
        The element coordinates are multiplied by ``window.devicePixelRatio``
        before cropping. According to the original author, skipping this on
        a retina screen makes the crop land on the wrong region.
    """
    try:
        # Capture the current page, which contains the captcha we need.
        name = f'{time.time()}.png'
        fileName = filePictruePath(name)
        driver.save_screenshot(fileName)

        # Locate the captcha element and read its position and size.
        imgelement = driver.find_element(*locator)
        location = imgelement.location
        x = int(location['x'])
        y = int(location['y'])
        size = imgelement.size
        width = int(size['width'])
        height = int(size['height'])

        dpr = driver.execute_script('return window.devicePixelRatio')
        # Crop box given by its top-left and bottom-right corners.
        rangle = (x * dpr, y * dpr, (x + width) * dpr, (y + height) * dpr)

        # Open the page screenshot in a context manager so the underlying
        # file handle is released even if cropping or saving fails.
        with Image.open(fileName) as page_image:
            screenshots = page_image.crop(rangle)
            ocr_name = f'{time.time()}ocr.png'
            screenshots_fileName = filePictruePath(ocr_name)
            screenshots.save(screenshots_fileName)
        return screenshots_fileName
    except Exception:
        # Deliberate best-effort: callers receive None when the screenshot
        # or crop could not be produced.
        return None

2.OCR識別圖片

def ocr_code(screenshots_fileName):
    """Run OCR on a saved image and return the filtered text.

    :param screenshots_fileName: path of the image file to recognise
    :return: the recognised text after being passed through ``filter_str``
    """
    # Open the image in a context manager so the file handle is closed as
    # soon as recognition is done.
    with Image.open(screenshots_fileName) as open_stream:
        # image_to_string returns the raw recognised text; strip the
        # surrounding whitespace before further processing.
        identify_text = pytesseract.image_to_string(open_stream).strip()
    print(identify_text)
    # Drop the characters that would interfere with the recognised code.
    identify_text = filter_str(identify_text)
    return identify_text

上面程式碼用到的輔助方法:

def filePictruePath(name):
    """Return the path of *name* inside the ``screenshot`` directory.

    The directory lives next to the parent directory of this file and is
    created on demand.

    :param name: file name to place inside the screenshot directory
    :return: ``<parent of this file's directory>/screenshot/<name>``
    :raises FileExistsError: if ``screenshot`` exists but is not a directory
    """
    # Resolve __file__ first: a relative __file__ without a directory part
    # would otherwise make the base empty and point at "/screenshot/".
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_dir = f"{base_dir}/screenshot/"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(file_dir, exist_ok=True)
    return os.path.join(file_dir, name)

def filter_str(args):
    """Strip every character that is not a digit or a letter.

    The argument is converted with ``str`` and all whitespace is removed.
    A character is then kept when it lies between ``'0'`` and ``'9'``, or
    when its upper-cased form compares between ``'A'`` and ``'Z'``.

    :param args: value to clean; converted to ``str`` first
    :return: the kept characters, in their original order and case
    """
    compact = ''.join(str(args).split())
    return ''.join(
        ch
        for ch in compact
        if '0' <= ch <= '9' or 'A' <= ch.upper() <= 'Z'
    )

執行上述程式碼時可能會出現錯誤(錯誤訊息在此未顯示):

具體解決辦法見:https://blog.csdn.net/qq_31362767/article/details/107891185