Update app.py
app.py CHANGED
@@ -50,38 +50,38 @@ import line_cor
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-
+def read_pdf(file):
     # images=pdf2image.convert_from_path(file)
     # # print(type(images))
-
-
-
-
-
-#
-#
-#
-#
-#
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        # img = Image.open(page)
+        # img = Image.open(page)
+        # img = img.save("img.png")
+        # image_name = cv2.imread("img.png")
+        # # get co-ordinates to cr
         # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-
-# return all_page_text
-def read_pdf_with_pdfplumber(file):
-    all_page_text=" "
-    # all_page_text = ""
-    with pdfplumber.open(file) as pdf:
-        page = pdf.pages[0]
-        ge=page.to_image()
-        img = Image.open(ge)
-        img = img.save("img.png")
-        image_name = cv2.imread("img.png")
-        # get co-ordinates to c
-        # return page.extract_text()
-        # get co-ordinates to cr
-        # # get co-ordinates to cr
-        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-        all_page_text += text + " " #page.extractText()
+        all_page_text += page.extractText()+" "
     return all_page_text
+# def read_pdf_with_pdfplumber(file):
+#     all_page_text=" "
+#     # all_page_text = ""
+#     with pdfplumber.open(file) as pdf:
+#         page = pdf.pages[0]
+#         ge=page.to_image()
+#         img = Image.open(ge)
+#         img = img.save("img.png")
+#         image_name = cv2.imread("img.png")
+#         # get co-ordinates to c
+#         # return page.extract_text()
+#         # get co-ordinates to cr
+#         # # get co-ordinates to cr
+#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#         all_page_text += text + " " #page.extractText()
+#     return all_page_text
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
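The new read_pdf helper uses the legacy PyPDF2 1.x interface (PdfFileReader, numPages, getPage, extractText). Below is a minimal self-contained sketch of the same logic, assuming that API is available; on PyPDF2 3.x / pypdf the equivalents are PdfReader, reader.pages, and page.extract_text().

# Sketch of the PDF text extraction added in this commit, assuming the PyPDF2 1.x API.
from PyPDF2 import PdfFileReader

def read_pdf(file):
    pdfReader = PdfFileReader(file)
    all_page_text = ""
    for i in range(pdfReader.numPages):
        page = pdfReader.getPage(i)
        all_page_text += page.extractText() + " "
    return all_page_text

Note that extractText only returns text embedded in the PDF; scanned pages without a text layer come back empty, which is presumably why the commented-out pdfplumber/Tesseract variant above exists.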
@@ -119,7 +119,7 @@ def main():
     st.subheader("Please, feed your image/text, features/services will appear automatically!")
     message = st.text_input("Type your text here!")
     camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
-    uploaded_photo = st.file_uploader("Upload
+    uploaded_photo = st.file_uploader("Upload Bangla or English Image/ English PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
     if "photo" not in st.session_state:
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
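Both input widgets pass on_change=change_photo_state, and the lines below gate processing on st.session_state["photo"]. The callback itself is not part of this hunk; a plausible minimal version, assuming it does nothing more than flip the flag that is checked afterwards:

# Hypothetical sketch of the on_change callback; the real change_photo_state
# is defined elsewhere in app.py.
import streamlit as st

def change_photo_state():
    # Signal that a photo/file has been supplied so the processing branch runs.
    st.session_state["photo"] = "done"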
@@ -128,26 +128,26 @@ def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
-            text =
+            text = read_pdf(uploaded_photo)
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif uploaded_photo.type != "application/image":
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
-
+            img = cv2.imread("img.png")
             # get co-ordinates to crop the image
-            imag, lc = line_cor.mark_region(imge)
+            #imag, lc = line_cor.mark_region(imge)
             #st.success(*lc)
-
+            # c = lc
             # cropping image img = image[y0:y1, x0:x1]
             #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
             # plt.imshow(img)
             # convert the image to black and white for better OCR
-            ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
+            #ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
-            text = str(pytesseract.image_to_string(
-
+            #text = str(pytesseract.image_to_string(img, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
+            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)
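For non-PDF uploads the commit re-reads the saved PNG with OpenCV and passes it straight to Tesseract, leaving the line_cor.mark_region cropping and the binary-threshold preprocessing commented out. A self-contained sketch of that path, assuming the upload is an image file and that Tesseract's Bengali ("ben") traineddata is installed:

# Sketch of the image OCR path used in main() for non-PDF uploads.
import cv2
import pytesseract
from PIL import Image

def ocr_image(uploaded_file, bangla=False):
    # Persist the upload, then re-read it as an OpenCV array, as the app does.
    Image.open(uploaded_file).save("img.png")
    img = cv2.imread("img.png")

    # Optional preprocessing kept commented out in the commit:
    # gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # _, thresh1 = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY)

    return pytesseract.image_to_string(img, lang="ben" if bangla else "eng")

In the app the Bangla toggle comes from st.checkbox("Mark to see Bangla Image's Text").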