Update app.py
Browse files
app.py
CHANGED
|
@@ -32,7 +32,7 @@ import docx2txt
|
|
| 32 |
from PIL import Image
|
| 33 |
from PyPDF2 import PdfFileReader
|
| 34 |
import pdfplumber
|
| 35 |
-
|
| 36 |
# NLP Pkgs
|
| 37 |
from textblob import TextBlob
|
| 38 |
import spacy
|
|
@@ -41,6 +41,7 @@ import requests
|
|
| 41 |
import cv2
|
| 42 |
import numpy as np
|
| 43 |
import pytesseract
|
|
|
|
| 44 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 45 |
from PIL import Image
|
| 46 |
def read_pdf(file):
|
|
@@ -49,14 +50,17 @@ def read_pdf(file):
|
|
| 49 |
all_page_text = ""
|
| 50 |
for i in range(count):
|
| 51 |
page = pdfReader.getPage(i)
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
return all_page_text
|
| 55 |
|
| 56 |
#def read_pdf_with_pdfplumber(file):
|
| 57 |
# with pdfplumber.open(file) as pdf:
|
| 58 |
# page = pdf.pages[0]
|
| 59 |
-
# return page.extract_text()
|
| 60 |
st.title("Streamlit NLP APP")
|
| 61 |
@st.experimental_singleton
|
| 62 |
def text_analyzer(my_text):
|
|
@@ -107,7 +111,18 @@ def main():
|
|
| 107 |
img = Image.open(uploaded_photo)
|
| 108 |
img = img.save("img.png")
|
| 109 |
img = cv2.imread("img.png")
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
st.success(text)
|
| 112 |
elif camera_photo:
|
| 113 |
img = Image.open(camera_photo)
|
|
|
|
| 32 |
from PIL import Image
|
| 33 |
from PyPDF2 import PdfFileReader
|
| 34 |
import pdfplumber
|
| 35 |
+
from line_cor import mark_region
|
| 36 |
# NLP Pkgs
|
| 37 |
from textblob import TextBlob
|
| 38 |
import spacy
|
|
|
|
| 41 |
import cv2
|
| 42 |
import numpy as np
|
| 43 |
import pytesseract
|
| 44 |
+
|
| 45 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 46 |
from PIL import Image
|
| 47 |
def read_pdf(file):
|
|
|
|
| 50 |
all_page_text = ""
|
| 51 |
for i in range(count):
|
| 52 |
page = pdfReader.getPage(i)
|
| 53 |
+
image_name = "Page_" + str(i) + ".jpg"
|
| 54 |
+
page.save(image_name, "JPEG")
|
| 55 |
+
text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
|
| 56 |
+
all_page_text += text + " " #page.extractText()
|
| 57 |
|
| 58 |
return all_page_text
|
| 59 |
|
| 60 |
#def read_pdf_with_pdfplumber(file):
|
| 61 |
# with pdfplumber.open(file) as pdf:
|
| 62 |
# page = pdf.pages[0]
|
| 63 |
+
# return page.extract_text()
|
| 64 |
st.title("Streamlit NLP APP")
|
| 65 |
@st.experimental_singleton
|
| 66 |
def text_analyzer(my_text):
|
|
|
|
| 111 |
img = Image.open(uploaded_photo)
|
| 112 |
img = img.save("img.png")
|
| 113 |
img = cv2.imread("img.png")
|
| 114 |
+
# get co-ordinates to crop the image
|
| 115 |
+
image, lc = mark_region(img)
|
| 116 |
+
c = lc[1]
|
| 117 |
+
# cropping image img = image[y0:y1, x0:x1]
|
| 118 |
+
img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
|
| 119 |
+
plt.figure(figsize=(10,10))
|
| 120 |
+
plt.imshow(img)
|
| 121 |
+
# convert the image to black and white for better OCR
|
| 122 |
+
ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
|
| 123 |
+
# pytesseract image to string to get results
|
| 124 |
+
text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
|
| 125 |
+
#text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
|
| 126 |
st.success(text)
|
| 127 |
elif camera_photo:
|
| 128 |
img = Image.open(camera_photo)
|