Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,7 @@ import os
|
|
| 25 |
#os.system('gunzip ben.traineddata.gz ')
|
| 26 |
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
|
| 27 |
#os.system('pip install -q pytesseract')
|
|
|
|
| 28 |
import streamlit as st
|
| 29 |
import torch
|
| 30 |
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
|
|
@@ -49,35 +50,35 @@ import line_cor
|
|
| 49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 50 |
from PIL import Image
|
| 51 |
@st.experimental_singleton
|
| 52 |
-
def read_pdf(file):
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
def read_pdf_with_pdfplumber(file):
|
| 69 |
-
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
# get co-ordinates to c
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
|
| 82 |
all_page_text += text + " " #page.extractText()
|
| 83 |
return all_page_text
|
|
@@ -139,11 +140,11 @@ def main():
|
|
| 139 |
#st.success(*lc)
|
| 140 |
c = lc
|
| 141 |
# cropping image img = image[y0:y1, x0:x1]
|
| 142 |
-
imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
|
| 143 |
#plt.figure(figsize=(10,10))
|
| 144 |
# plt.imshow(img)
|
| 145 |
# convert the image to black and white for better OCR
|
| 146 |
-
ret,thresh1 = cv2.threshold(
|
| 147 |
# pytesseract image to string to get results
|
| 148 |
text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
|
| 149 |
#text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
|
|
|
|
| 25 |
#os.system('gunzip ben.traineddata.gz ')
|
| 26 |
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
|
| 27 |
#os.system('pip install -q pytesseract')
|
| 28 |
+
# Install poppler when the app starts. "-y" pre-answers conda's
# "Proceed ([y]/n)?" confirmation, which nobody can answer in a
# non-interactive app process.
os.system('conda install -y -c conda-forge poppler')
|
| 29 |
import streamlit as st
|
| 30 |
import torch
|
| 31 |
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
|
|
|
|
| 50 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 51 |
from PIL import Image
|
| 52 |
@st.experimental_singleton
|
| 53 |
+
# def read_pdf(file):
|
| 54 |
+
# images=pdf2image.convert_from_path(file)
|
| 55 |
+
# # print(type(images))
|
| 56 |
+
# # pdfReader = PdfFileReader(file)
|
| 57 |
+
# # count = pdfReader.numPages
|
| 58 |
+
# all_page_text = ""
|
| 59 |
+
# for page in images:
|
| 60 |
+
# # page = pdfReader.getPage(i)
|
| 61 |
+
# #img = Image.open(page)
|
| 62 |
+
# img = Image.open(page)
|
| 63 |
+
# img = img.save("img.png")
|
| 64 |
+
# image_name = cv2.imread("img.png")
|
| 65 |
+
# # get co-ordinates to cr
|
| 66 |
+
# text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
|
| 67 |
+
# all_page_text += text + " " #page.extractText()
|
| 68 |
+
# return all_page_text
|
| 69 |
def read_pdf_with_pdfplumber(file):
    """OCR the first page of a PDF and return the recognised text.

    The first page is rendered to an image with pdfplumber, written to
    ``img.png``, read back with OpenCV and passed to pytesseract. A
    Streamlit checkbox selects Bengali (``lang="ben"``); when it is not
    ticked, pytesseract is called without a ``lang`` argument.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the PDF to read.

    Returns:
        A string made of a leading space, the OCR text of the first page
        and a trailing space. A PDF with no pages yields ``" "``.
    """
    # NOTE(review): the ``@st.experimental_singleton`` decorator that sat above
    # the now commented-out ``read_pdf`` is followed only by comments, so it
    # decorates this function instead. Confirm that caching a function which
    # creates a checkbox widget is intended.
    all_page_text = " "
    with pdfplumber.open(file) as pdf:
        if not pdf.pages:
            # Nothing to render; avoid an IndexError on pages[0].
            return all_page_text

        # Only the first page is processed, as before.
        page = pdf.pages[0]

        # ``to_image()`` returns a pdfplumber PageImage, not a path or file
        # object, so it cannot be handed to ``Image.open``. Save it directly.
        page_image = page.to_image()
        page_image.save("img.png", format="PNG")
        image_name = cv2.imread("img.png")

        # Alternative without OCR: return page.extract_text()
        if st.checkbox("Mark to see Bangla Image's Text"):
            text = pytesseract.image_to_string(image_name, lang="ben")
        else:
            text = pytesseract.image_to_string(image_name)
        all_page_text += text + " "
    return all_page_text
|
|
|
|
| 140 |
#st.success(*lc)
|
| 141 |
c = lc
|
| 142 |
# cropping image img = image[y0:y1, x0:x1]
|
| 143 |
+
#imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
|
| 144 |
#plt.figure(figsize=(10,10))
|
| 145 |
# plt.imshow(img)
|
| 146 |
# convert the image to black and white for better OCR
|
| 147 |
+
ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
|
| 148 |
# pytesseract image to string to get results
|
| 149 |
text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
|
| 150 |
#text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
|