Fix the language problem
- README.md +61 -0
- app.py +6 -1
- lec2note/ingestion/__pycache__/whisper_runner.cpython-310.pyc +0 -0
- lec2note/ingestion/whisper_runner.py +5 -5
- lec2note/processing/__pycache__/processor.cpython-310.pyc +0 -0
- lec2note/processing/processor.py +1 -1
- lec2note/synthesis/__pycache__/assembler.cpython-310.pyc +0 -0
- lec2note/synthesis/assembler.py +1 -0
README.md
CHANGED
@@ -11,3 +11,64 @@ short_description: Using for automatically generating notes from video lectures
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+### Project Overview
+Lec2Note is an **automatic lecture-to-note generator**. Upload a lecture video (MP4/MKV/AVI) and receive a well-formatted Markdown study note containing:
+
+- **ASR transcription** powered by OpenAI Whisper.
+- **Video segmentation** using semantic & visual cues.
+- **LLM summarisation** (e.g. GPT-4) for each segment, extracting key points, formulas and insights.
+- **Image extraction** of key frames to illustrate the note.
+- **Markdown assembly** into a single readable document.
+
+### Installation
+```bash
+# Requires Python ≥ 3.10
+git clone https://github.com/your-name/Lec2Note.git
+cd Lec2Note
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+```
+For GPU inference, ensure CUDA and the matching PyTorch build are installed.
+
+### Quick Start
+#### 1. Web UI
+Navigate to `https://huggingface.co/spaces/LRU1/lec2note`.
+
+#### 2. CLI
+```bash
+python -m lec2note.scripts.run_pipeline --video path/to/lecture.mp4 --output notes.md
+```
+
+#### 3. Required Environment Variables
+```bash
+export OPENAI_API_KEY=your_openai_api_key
+export REPLICATE_API_TOKEN=your_replicate_api_token
+export LOG_LEVEL=DEBUG          # optional
+export AUDIO2TEXT_LOCAL=true    # optional: true or false
+```
+
+### Directory Structure
+```text
+Lec2Note/
+├── app.py                # Streamlit front-end
+├── lec2note/
+│   ├── ingestion/        # Audio/video preprocessing & ASR
+│   ├── segmentation/     # Semantic + visual segmentation
+│   ├── processing/       # LLM summarisation & note generation
+│   ├── synthesis/        # Markdown assembly
+│   └── scripts/          # CLI entry points
+└── tests/                # Test suite
+```
+
+### Environment Variables
+Some modules require the following environment variables:
+- `OPENAI_API_KEY`: OpenAI access token.
+- `WHISPER_MODEL`: Whisper model name; defaults to `base`.
+
+### Contributing
+Pull requests and issues are welcome! See `DEVELOPER_GUIDE.md` for code conventions and workflow.
+
+### License
+Released under the Apache-2.0 license.
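The README documents several optional environment variables alongside the required API keys; such settings are typically read once at startup with sensible fallbacks. A minimal sketch of that pattern; `load_config` is an illustrative helper and not an actual Lec2Note function, and the accepted truthy spellings are a design choice, not repo behaviour:

```python
import os

def load_config() -> dict:
    """Collect the settings the README documents, applying defaults.

    Illustrative only: Lec2Note itself may read these variables elsewhere.
    """
    return {
        "openai_api_key": os.environ.get("OPENAI_API_KEY"),            # required
        "replicate_api_token": os.environ.get("REPLICATE_API_TOKEN"),  # required
        "log_level": os.environ.get("LOG_LEVEL", "INFO"),              # optional
        # optional boolean flag: accept common truthy spellings
        "audio2text_local": os.environ.get("AUDIO2TEXT_LOCAL", "false").lower()
        in ("1", "true", "yes"),
        "whisper_model": os.environ.get("WHISPER_MODEL", "base"),      # optional
    }

os.environ["AUDIO2TEXT_LOCAL"] = "true"
os.environ.pop("WHISPER_MODEL", None)
cfg = load_config()
```

Centralising the reads this way also gives one place to fail fast when a required key is missing.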
app.py
CHANGED
@@ -2,11 +2,16 @@ import streamlit as st
 from pathlib import Path
 import tempfile, subprocess, threading, queue
 import textwrap
+import streamlit.components.v1 as components
 
 st.set_page_config(page_title="Lec2Note2 – Lecture-to-Notes", layout="wide")
 
 st.title("Lec2Note – Automatic Lecture Notes Generator")
 
+# Inject MathJax once for LaTeX rendering
+MATHJAX = "<script src='https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-svg.js'></script>"
+components.html(MATHJAX, height=0)
+
 st.markdown(
     textwrap.dedent(
         """

@@ -85,7 +90,7 @@ if run_btn and video_file:
     st.success("✅ Notes generated!")
     md_content = output_md.read_text()
     with st.container(border=True):
-        st.markdown(md_content)
+        st.markdown(md_content, unsafe_allow_html=True)
     st.download_button(
         label="💾 Download notes.md",
         data=md_content,
lec2note/ingestion/__pycache__/whisper_runner.cpython-310.pyc
CHANGED
Binary files a/lec2note/ingestion/__pycache__/whisper_runner.cpython-310.pyc and b/lec2note/ingestion/__pycache__/whisper_runner.cpython-310.pyc differ
lec2note/ingestion/whisper_runner.py
CHANGED
@@ -17,10 +17,10 @@ __all__ = ["WhisperRunner"]
 
 
 class WhisperRunner:  # noqa: D101
-    model_name: str = "
+    model_name: str = "large-v3"
 
     @classmethod
-    def transcribe(cls, audio_fp: str | Path, lang: str =
+    def transcribe(cls, audio_fp: str | Path, lang: str = None) -> List[Dict[str, Any]]:
         """Transcribe ``audio_fp`` and return list with start/end/text.
 
         Notes

@@ -32,7 +32,7 @@ class WhisperRunner:  # noqa: D101
         sub_path = audio_path.with_suffix(".json")
         if sub_path.exists():
             logger.info("[Whisper] loading existing subtitles.")
-            with open(sub_path, "r") as f:
+            with open(sub_path, "r", encoding="utf-8") as f:
                 return json.load(f)
         if not audio_path.exists():
             raise FileNotFoundError(audio_path)

@@ -92,6 +92,6 @@ class WhisperRunner:  # noqa: D101
             }
             for seg in segments
         ]
-        with open(sub_path, "w") as f:
-            json.dump(results, f, indent=2)
+        with open(sub_path, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
         return results
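The whisper_runner change writes the subtitle cache as UTF-8 with `ensure_ascii=False`, so non-ASCII transcripts stay readable on any platform default encoding. A standalone sketch of that cache-or-transcribe pattern; `cached_segments` is illustrative, not the repo's API:

```python
import json
import tempfile
from pathlib import Path

def cached_segments(sub_path: Path, transcribe) -> list:
    """Return cached segments from sub_path if present, else transcribe and cache.

    Illustrative version of the pattern in WhisperRunner.transcribe;
    ``transcribe`` stands in for the actual Whisper call.
    """
    if sub_path.exists():
        # explicit utf-8 read so non-ASCII subtitles load on any platform
        with open(sub_path, "r", encoding="utf-8") as f:
            return json.load(f)
    results = transcribe()
    # ensure_ascii=False keeps e.g. CJK text human-readable in the JSON file
    with open(sub_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    return results

tmp = Path(tempfile.mkdtemp()) / "audio.json"
segs = [{"start": 0.0, "end": 1.5, "text": "机器学习"}]
first = cached_segments(tmp, lambda: segs)  # cache miss: "transcribes" and writes
second = cached_segments(tmp, lambda: [])   # cache hit: the lambda is never called
```

The second call is served from disk, and the raw file contains the CJK characters verbatim rather than `\uXXXX` escapes.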
lec2note/processing/__pycache__/processor.cpython-310.pyc
CHANGED
Binary files a/lec2note/processing/__pycache__/processor.cpython-310.pyc and b/lec2note/processing/__pycache__/processor.cpython-310.pyc differ
lec2note/processing/processor.py
CHANGED
@@ -69,7 +69,7 @@ class Processor:  # noqa: D101
         "   - For **tables**, recreate them using Markdown table syntax.\n"
         "   - For **code**, use Markdown code blocks with appropriate language identifiers.\n\n"
         "3. **Structure and Format**: Organize the notes logically. Use headings, subheadings, lists, and bold text to create a clear, readable, and well-structured document.\n\n"
-        "4. **Language**: The notes should align with the subtitles
+        "4. **Language**: The notes' language should align with the subtitles!\n\n"
         "5. **Image Mapping**: Stop referencing the images and try to use formulas, tables, code snippets, or important diagrams to describe the images.\n\n"
         "---BEGIN LECTURE MATERIALS---\n"
         f"**Subtitles (placeholders inserted)**:\n{placeholder_subs}"
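The prompt feeds the model subtitles with image placeholders already inserted, but this diff does not show how they are interleaved. A hypothetical sketch keyed on timestamps; `insert_placeholders` and the `[IMAGE_n]` token format are assumptions, not the repo's actual mechanism:

```python
def insert_placeholders(segments, image_times):
    """Interleave image placeholder tokens with subtitle text by timestamp.

    Hypothetical sketch of how a ``placeholder_subs`` string could be built:
    each image is emitted before the first segment that starts at or after
    its capture time.
    """
    lines, img_idx = [], 0
    for seg in segments:
        # emit any image whose capture time falls before this segment starts
        while img_idx < len(image_times) and image_times[img_idx] <= seg["start"]:
            lines.append(f"[IMAGE_{img_idx + 1}]")
            img_idx += 1
        lines.append(seg["text"])
    # any images captured after the last segment go at the end
    lines.extend(f"[IMAGE_{i + 1}]" for i in range(img_idx, len(image_times)))
    return "\n".join(lines)

subs = [{"start": 0.0, "text": "Welcome."}, {"start": 30.0, "text": "Gradient descent."}]
out = insert_placeholders(subs, [12.0])  # one key frame captured at t=12s
```

Anchoring placeholders to segment start times keeps each image adjacent to the speech it illustrated, which is what lets the prompt's "Image Mapping" rule replace the token with a formula or table in context.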
lec2note/synthesis/__pycache__/assembler.cpython-310.pyc
CHANGED
Binary files a/lec2note/synthesis/__pycache__/assembler.cpython-310.pyc and b/lec2note/synthesis/__pycache__/assembler.cpython-310.pyc differ
lec2note/synthesis/assembler.py
CHANGED
@@ -52,6 +52,7 @@ class Assembler:  # noqa: D101
         "1. **De-duplicate and Consolidate:** Identify all repetitive definitions and explanations. Merge them into a single, comprehensive section for each core concept.\n"
         "2. **Synthesize and Enhance:** Where different fragments explain the same concept with slightly different examples or details (e.g., one note uses a 'cheetah' example, another uses a 'robot'), synthesize these details to create a richer, more complete explanation under a single heading.\n"
         "3. **Polish and Format:** Ensure the final text is grammatically correct, flows naturally, and uses consistent, clean Markdown formatting (e.g., for tables, code blocks, and mathematical notation).\n\n"
+        "4. **Language:** The notes' language should align with the subtitles!\n\n"
         "**Constraint:** Ensure all unique concepts and key details from the original notes are preserved in the final document. The goal is to lose redundancy, not information.\n\n"
         "Here are the fragmented notes to process:\n\n"
         f"{raw_md}"