Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

App Files Community

Soumen committed on Nov 25, 2022

Commit

8c11fa3

1 Parent(s): ed0375d

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -29

app.py CHANGED Viewed

@@ -25,6 +25,7 @@ import os
 #os.system('gunzip ben.traineddata.gz ')
 #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
 #os.system('pip install -q pytesseract')
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
@@ -49,35 +50,35 @@ import line_cor
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-def read_pdf(file):
-    images=pdf2image.convert_from_path(file)
-    # print(type(images))
-    # pdfReader = PdfFileReader(file)
-    # count = pdfReader.numPages
-    all_page_text = ""
-    for page in images:
-       # page = pdfReader.getPage(i)
-        #img = Image.open(page)
-        img = Image.open(page)
-        img = img.save("img.png")
-        image_name = cv2.imread("img.png")
-        # get co-ordinates to cr
-        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-        all_page_text += text + " " #page.extractText()
-    return all_page_text
 def read_pdf_with_pdfplumber(file):
-#     all_page_text=" "
-# #     all_page_text = ""
-#     #with pdfplumber.open(file) as pdf:
-# 	   # page = pdf.pages[0]
-#     ge=page.to_image()
-#     img = Image.open(ge)
-#     img = img.save("img.png")
-#     image_name = cv2.imread("img.png")
 # get co-ordinates to c
-        #return page.extract_text()
-        # get co-ordinates to cr
-## get co-ordinates to cr
     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
     all_page_text += text + " " #page.extractText()
     return all_page_text
@@ -139,11 +140,11 @@ def main():
             #st.success(*lc)
             c = lc
             # cropping image img = image[y0:y1, x0:x1]
-            imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
            # plt.imshow(img)
             # convert the image to black and white for better OCR
-            ret,thresh1 = cv2.threshold(imgg,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
             text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)

 #os.system('gunzip ben.traineddata.gz ')
 #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
 #os.system('pip install -q pytesseract')
+os.system('conda install -c conda-forge poppler')
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
+# def read_pdf(file):
+#     images=pdf2image.convert_from_path(file)
+#     # print(type(images))
+#     # pdfReader = PdfFileReader(file)
+#     # count = pdfReader.numPages
+#     all_page_text = ""
+#     for page in images:
+#        # page = pdfReader.getPage(i)
+#         #img = Image.open(page)
+#         img = Image.open(page)
+#         img = img.save("img.png")
+#         image_name = cv2.imread("img.png")
+#         # get co-ordinates to cr
+#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#         all_page_text += text + " " #page.extractText()
+#     return all_page_text
 def read_pdf_with_pdfplumber(file):
+    all_page_text=" "
+#     all_page_text = ""
+    with pdfplumber.open(file) as pdf:
+	    page = pdf.pages[0]
+    ge=page.to_image()
+    img = Image.open(ge)
+    img = img.save("img.png")
+    image_name = cv2.imread("img.png")
 # get co-ordinates to c
+#         return page.extract_text()
+#         get co-ordinates to cr
+# # get co-ordinates to cr
     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
     all_page_text += text + " " #page.extractText()
     return all_page_text
             #st.success(*lc)
             c = lc
             # cropping image img = image[y0:y1, x0:x1]
+            #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
            # plt.imshow(img)
             # convert the image to black and white for better OCR
+            ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
             text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)