Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from doctr.io import DocumentFile | |
| from reportlab.pdfbase import pdfmetrics | |
| from reportlab.pdfbase.ttfonts import TTFont | |
| from PIL import Image | |
| import pytesseract | |
| import utils | |
# OCR settings read by main().
# reco_arch is forwarded to utils.get_ocr_predictor; presumably the file name
# of a recognition-model checkpoint -- confirm against utils.
reco_arch = "kz_latest.pt"
# Module-level switch: when true, main() runs an additional pytesseract pass
# on uploaded image files.
use_pytesseract = True

# Register a Unicode-compatible TrueType font with ReportLab at import time.
fontname = "Ubuntu"
fontpath = "./Ubuntu-Regular.ttf"
pdfmetrics.registerFont(TTFont(fontname, fontpath))
def main():
    """Build the Streamlit interface for OCR of Kazakh-language documents.

    The page shows a file uploader (PDF, PNG, JPEG). Once a file is
    uploaded, the user picks a page; that page is displayed in the left
    column and the text assembled by ``utils.ocr_to_txt`` from the
    predictor output is displayed in the right column.

    When the module-level ``use_pytesseract`` flag is true, image uploads
    are additionally passed through pytesseract (languages
    ``kaz+eng+rus``) and the result is shown in a collapsible block; for
    PDF uploads a warning is shown instead.

    Returns:
        None. The function only renders Streamlit elements.
    """
    # Wide layout. set_page_config has to be the first Streamlit command.
    st.set_page_config(layout="wide")

    # Hide Streamlit's default menu, footer and header.
    hide_st_style = """
    <style>
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    header {visibility: hidden;}
    </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)

    # Interface layout
    st.title("Қазақша жазылған құжаттардың OCR")

    # The uploader sits at the top of the page.
    uploaded_file = st.file_uploader(
        "Файлдарды жүктеңіз", type=["pdf", "png", "jpeg", "jpg"]
    )

    # Blank line used as a spacer
    st.write("\n")

    # Two equal-width columns: source page on the left, text on the right.
    cols = st.columns((1, 1))
    cols[0].subheader("Бастапқы бет")
    cols[1].subheader("Мәтіннің біріктірілген нұсқасы")

    # Nothing more to render until a file has been uploaded.
    if uploaded_file is None:
        return

    print(uploaded_file.name)
    filename = uploaded_file.name.lower()

    # getvalue() returns the complete buffer regardless of the current
    # stream position, whereas read() only returns what is left after the
    # position (and moves it to the end).
    file_bytes = uploaded_file.getvalue()
    if filename.endswith(".pdf"):
        doc = DocumentFile.from_pdf(file_bytes)
    else:
        doc = DocumentFile.from_images(file_bytes)

    # The select box is 1-based for the user; convert back to a 0-based index.
    page_numbers = [idx + 1 for idx in range(len(doc))]
    page_idx = st.selectbox("Бетті таңдау", page_numbers) - 1
    page = doc[page_idx]
    cols[0].image(page)

    with st.spinner("Модельді жүктеу..."):
        predictor = utils.get_ocr_predictor(reco_arch=reco_arch)

    with st.spinner("Талдау..."):
        # Run the selected page through the model.
        out = predictor([page])
        page_export = out.pages[0].export()
        # Only the first element of the returned triple is used here.
        coordinates, _, _ = utils.page_to_coordinates(page_export)
        final_text = utils.ocr_to_txt(coordinates)

    # NOTE: a box-overlay preview built with utils.draw_boxes_with_labels
    # used to be rendered in cols[1] here; it was disabled in the original.

    # Show the merged text.
    cols[1].text_area("Мәтіннің біріктірілген нұсқасы:", final_text, height=500)

    # Optional second OCR pass, controlled by the module-level flag.
    if not use_pytesseract:
        return

    if filename.endswith((".png", ".jpg", ".jpeg")):
        # Rewind explicitly so PIL decodes from the start of the upload.
        uploaded_file.seek(0)
        image = Image.open(uploaded_file)
        ocr_text = pytesseract.image_to_string(image, lang="kaz+eng+rus")
        # Collapsible block for the pytesseract result
        with st.expander("OCR нәтижесі (pytesseract)"):
            st.text_area("Тексеру нәтижесі:", ocr_text, height=300)
    else:
        st.warning("OCR тек суреттер үшін қол жетімді.")
# Script entry point: render the app only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()