- DEVELOPER_GUIDE.md +96 -117
- lec2note/__pycache__/types.cpython-310.pyc +0 -0
- lec2note/api/main.py +8 -16
- lec2note/ingestion/__pycache__/audio_extractor.cpython-310.pyc +0 -0
- lec2note/ingestion/__pycache__/audio_extractor.cpython-313.pyc +0 -0
- lec2note/ingestion/__pycache__/whisper_runner.cpython-313.pyc +0 -0
- lec2note/ingestion/audio_extractor.py +4 -2
- lec2note/processing/__pycache__/processor.cpython-310.pyc +0 -0
- lec2note/processing/processor.py +1 -1
- lec2note/scripts/__pycache__/run_pipeline.cpython-310.pyc +0 -0
- lec2note/scripts/__pycache__/run_pipeline.cpython-313.pyc +0 -0
- lec2note/scripts/run_pipeline.py +2 -10
- lec2note/segmentation/__pycache__/chunk_merger.cpython-310.pyc +0 -0
- lec2note/segmentation/__pycache__/semantic_segmenter.cpython-310.pyc +0 -0
- lec2note/segmentation/__pycache__/sentence_chunker.cpython-310.pyc +0 -0
- lec2note/segmentation/chunk_merger.py +68 -0
- lec2note/segmentation/sentence_chunker.py +80 -0
- lec2note/segmentation/visual_merger.py +63 -0
- lec2note/synthesis/__pycache__/assembler.cpython-310.pyc +0 -0
- lec2note/synthesis/assembler.py +3 -6
- lec2note/utils/__pycache__/logging_config.cpython-313.pyc +0 -0
- lec2note/vision/__pycache__/frame_extractor.cpython-310.pyc +0 -0
- lec2note/vision/__pycache__/image_sampler.cpython-310.pyc +0 -0
- lec2note/vision/__pycache__/keyframe_extractor.cpython-310.pyc +0 -0
- lec2note/vision/frame_extractor.py +76 -0
- lec2note/vision/image_comparator.py +58 -0
- lec2note/vision/image_sampler.py +28 -0
- lec2note/vision/keyframe_extractor.py +24 -0
DEVELOPER_GUIDE.md
CHANGED
@@ -1,149 +1,132 @@

*(The previous revision of the guide is replaced wholesale below; only fragments of the removed text survive extraction. Its outline covered the same ground: §4.1 was titled "Hybrid Segmentation"; §4.2 listed per-chunk helpers `extract_keyframes(chunk)`, `run_ocr_on_frames(frames)`, and `extract_and_transcribe_audio(chunk)`; §4.3 used `synchronize_text_and_frames(subtitles, frames)` and `generate_note_chunk(synchronized_data)`; §4.7 showed `subtitles.jsonl` and `chunk_schema.json` excerpts.)*
# Lec2Note Developer Guide

## 1. Project Overview
Lec2Note aims to deliver an end-to-end solution for **"automatically generating notes from video lectures"**.
Using multimodal analysis, it deeply fuses the video frames with the audio content to produce notes that are **richly illustrated and clearly structured**.

### 1.1 Core Pipeline
1. **Subtitle-based fine-grained chunking**: treat every ASR sentence as an independent *micro-chunk* and capture a keyframe precisely at the sentence's end timestamp.
2. **Hierarchical merge strategy**
   - **Stage 1 · Visual pre-merge**: merge consecutive *micro-chunks* based on visual similarity alone.
   - **Stage 2 · Semantic merge**: merge the visually merged chunks further based on textual semantic similarity.
3. **Multimodal information extraction and sampling**
   - **Text**: concatenate all subtitle text contained in the topic chunk.
   - **Images**: sample the topic chunk's keyframes (at most 6).
4. **Per-chunk note generation**: call a multimodal LLM to generate an independent note for each topic chunk.
5. **Global note synthesis**: call the LLM again to consolidate, deduplicate, and polish all chunk notes.

### 1.2 Target Use Cases
- Recorded academic courses
- Meeting / seminar recordings
- Internal corporate training videos

---
## 2. Technology Stack

| Layer | Main technologies | Notes |
|------|----------|------|
| **Language** | Python 3.9+ | Core codebase is Python |
| **Video / image processing** | OpenCV, Pillow | Frame capture, image processing, image similarity (SSIM / pHash) |
| **OCR** | PaddleOCR / Tesseract | Extract slide text from keyframes |
| **ASR** | Whisper / Faster-Whisper | Sentence-level timestamps |
| **Semantic analysis** | Sentence Transformers | Text semantic similarity |
| **LLM** | Gemini-2.5-pro | Multimodal model for note generation |
| **Web framework** | FastAPI | RESTful & WebSocket services |
| **Task orchestration** | Prefect / Celery | Batch processing and retry mechanics |
| **Database** | SQLite (dev) / PostgreSQL (prod) | Metadata and task state |
| **Containers** | Docker & Docker Compose | One-command deployment |

---
## 3. Directory Layout and Modules

```text
Lec2Note/
├── docs/                  # Design docs & meeting notes
├── lec2note/              # Source package (Python)
│   ├── ingestion/         # Audio processing & ASR
│   ├── vision/            # Video frame processing
│   │   ├── frame_extractor.py
│   │   ├── image_comparator.py
│   │   ├── image_sampler.py
│   │   └── ocr_processor.py
│   ├── segmentation/      # Chunking and merging
│   │   ├── sentence_chunker.py
│   │   └── chunk_merger.py
│   ├── processing/        # Multimodal fusion & LLM generation
│   ├── synthesis/         # Global note assembly & export
│   ├── assets/            # Static templates (Markdown/HTML)
│   └── api/               # FastAPI routes
├── scripts/               # CLI scripts & task scheduling
├── tests/                 # PyTest unit & integration tests
├── Dockerfile
├── docker-compose.yml
└── README.md
```

---
## 4. Core Features

### 4.1 Chunking and Merging Strategy
1. **Sentence chunks**: `sentence_chunker.run()` turns the ASR output into *sentence_chunks* of the form `{start, end, text, keyframe_path}`.
2. **Hierarchical merge**: `chunk_merger.run()` first pre-merges visually, then merges semantically, yielding *topic_chunks*.
3. **Keyframe sampling**: `image_sampler.sample()` uniformly samples keyframes, keeping at most 6.
4. **Output**: the result is `final_chunks` carrying text plus representative screenshots, as sketched below.
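A minimal driver sketch of steps 1-4; the input file names are hypothetical:

```python
from lec2note.ingestion.whisper_runner import WhisperRunner
from lec2note.segmentation.chunk_merger import ChunkMerger

# ChunkMerger.run wires all four steps together: sentence chunking,
# visual pre-merge, semantic merge, and keyframe sampling (max 6 images).
subtitles = WhisperRunner.transcribe("lecture.wav")
final_chunks = ChunkMerger.run(subtitles, "lecture.mp4")
for fc in final_chunks:
    print(fc.start, fc.end, [p.name for p in fc.images])
```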
### 4.2 Information Extraction
Run OCR on the sampled screenshots to enrich the textual information.
### 4.3 Text-Image Fusion and Generation
- **Prompt construction**: insert `[IMAGE_n]` placeholders into the text and attach the corresponding image list (see the sketch below).
- **LLM call**: `processor.generate_note_chunk()` uses a multimodal LLM to generate the note in Markdown.
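A sketch of the placeholder scheme; the diff to `processor.py` later in this change shows the concrete token is `[IMG{n}]`, and the message structure here is illustrative (OpenAI-style parts):

```python
subtitle_text = "Welcome to the Lec2Note course. ..."  # concatenated chunk text
image_urls = ["frames/kf_3.20s.png", "frames/kf_15.80s.png"]  # sampled keyframes

# Append one numbered placeholder per image so the LLM can reference
# each frame unambiguously in the generated Markdown.
placeholder_subs = subtitle_text
for idx, _ in enumerate(image_urls, start=1):
    placeholder_subs += f"\n\n[IMG{idx}]"

# Multimodal message payload: one text part plus one image part per placeholder.
content = [{"type": "text", "text": placeholder_subs}]
content += [{"type": "image_url", "image_url": {"url": u}} for u in image_urls]
```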
### 4.4 Note Synthesis
`assembler.merge()` collects the text of every `note_chunk`, builds a new prompt, and calls the LLM to deduplicate, reorder, and polish, producing the complete note document.
### 4.5 API Overview

| Method | Path | Purpose |
|------|------|------|
| `POST` | `/upload` | Upload a video → returns a task ID |
| `GET` | `/status/{id}` | Query task progress |
| `GET` | `/notes/{id}` | Fetch the generated illustrated notes |
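A usage sketch with Python `requests`; the host/port, upload field name, and response field `id` are assumptions — check `lec2note/api/main.py` for the exact schema:

```python
import requests

BASE = "http://localhost:8000"  # assumed dev address

# Upload a lecture video, then poll status and fetch the notes.
with open("lecture.mp4", "rb") as f:
    task_id = requests.post(f"{BASE}/upload", files={"file": f}).json()["id"]

status = requests.get(f"{BASE}/status/{task_id}").json()
notes_md = requests.get(f"{BASE}/notes/{task_id}").text
```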
### 4.6 Internal Module Interfaces

| Module | Key class / method | Input | Output | Notes |
|------|--------------|------|------|------|
| `ingestion.whisper_runner` | `WhisperRunner.transcribe()` | audio path | `[{start, end, text}, …]` | sentence-level ASR result |
| `vision.frame_extractor` | `FrameExtractor.capture_at()` | video path, timestamp list | image path list | precise frame capture |
| `vision.image_comparator` | `ImageComparator.get_similarity()` | two image paths | similarity (0-1) | pHash / SSIM |
| `vision.image_sampler` | `ImageSampler.sample()` | image path list, `max_n` | sampled path list | uniform sampling |
| `segmentation.sentence_chunker` | `SentenceChunker.run()` | subtitle list, video path | *sentence_chunks* | micro-chunk generation |
| `segmentation.chunk_merger` | `ChunkMerger.run()` | subtitle list, video path | `final_chunks` | hierarchical merge |
| `processing.processor` | `Processor.generate_note()` | `final_chunk`, subtitle list | `NoteChunk` | LLM call |
| `synthesis.assembler` | `Assembler.merge()` | `NoteChunk` list | Markdown/HTML | global assembly |
### 4.7 Data Format Example

```json
{
  "start": 0.0,
  "end": 25.4,
  "text": "Welcome to the Lec2Note course. Today we introduce multimodal note generation. First, we will walk through the system's core pipeline...",
  "representative_frames": [
    "frames/kf_3.20s.png",
    "frames/kf_15.80s.png",
    "frames/kf_22.10s.png"
  ]
}
```
### 4.8 Robustness: Error Handling and Retries
- **Task atomicity**: every step is defined as an independent task.
- **Automatic retries**: network/LLM failures are retried with exponential backoff.
- **Failure isolation**: a single failed task does not block the overall pipeline; failures are logged for later inspection. A minimal backoff sketch follows.
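A minimal sketch of the retry policy in plain Python, rather than the Prefect/Celery mechanisms listed in §2:

```python
import random
import time


def call_with_backoff(fn, *, retries=5, base=1.0, cap=30.0):
    """Retry ``fn`` on failure with exponential backoff plus jitter."""
    for attempt in range(retries):
        try:
            return fn()
        except Exception:  # noqa: BLE001 - isolate the failing task
            if attempt == retries - 1:
                raise
            # delay doubles each attempt, jittered, capped at ``cap`` seconds
            time.sleep(min(cap, base * 2 ** attempt) * random.uniform(0.5, 1.5))
```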
---

## 5. Development Environment Setup
```bash
# Clone the repository
git clone [email protected]:your_org/lec2note.git

# … (unchanged lines omitted by this diff, ending with: export OPENAI_API_KEY="YOUR_KEY") …

pytest -q
```
### 5.1 Quick Local Pipeline Run
```bash
python -m lec2note.scripts.run_pipeline \
    --video example.mp4 \
    --output notes.md
```
---

## 6. Deployment Guide

### 6.1 Docker Compose
```bash
docker compose up -d --build
```
lec2note/__pycache__/types.cpython-310.pyc
CHANGED
Binary files a/lec2note/__pycache__/types.cpython-310.pyc and b/lec2note/__pycache__/types.cpython-310.pyc differ

lec2note/api/main.py
CHANGED
```diff
@@ -18,7 +18,8 @@ from lec2note.ingestion.audio_extractor import AudioExtractor
 from lec2note.ingestion.whisper_runner import WhisperRunner
 from lec2note.segmentation.visual_segmenter import VisualSegmenter
 from lec2note.segmentation.semantic_segmenter import SemanticSegmenter
-…
+# new merger handles images; no global keyframe extraction
+from lec2note.segmentation.chunk_merger import ChunkMerger
 from lec2note.vision.ocr_processor import OcrProcessor
 from lec2note.processing.processor import Processor
 from lec2note.synthesis.assembler import Assembler
@@ -41,23 +42,14 @@ def _run_pipeline(job_id: str, video_path: Path):
     job["status"] = "asr"
     subtitles = WhisperRunner.transcribe(wav)
 
-    job["status"] = "…
-…
-…
-    job["status"] = "semantic_refine"
-    final_chunks_dict = SemanticSegmenter.refine(slide_chunks, subtitles)
-
-    # attach images to chunks
-    keyframes = KeyframeExtractor.run(video_path)
-    final_chunks: list[FinalChunk] = []
-    for ch in final_chunks_dict:
-        fc = FinalChunk(start=ch["start"], end=ch["end"], images=keyframes)
-        final_chunks.append(fc)
+    job["status"] = "chunk_merging"
+    final_chunks = ChunkMerger.run(subtitles, video_path)
 
     job["status"] = "ocr"
-    # run OCR for all …
-    for …
-…
+    # run OCR for all images in final_chunks
+    # for fc in final_chunks:
+    #     for img in fc.images:
+    #         OcrProcessor.run(img)
 
     job["status"] = "generate_notes"
     note_chunks = [Processor.generate_note(fc, subtitles) for fc in final_chunks]
```
lec2note/ingestion/__pycache__/audio_extractor.cpython-310.pyc
CHANGED
Binary files a/lec2note/ingestion/__pycache__/audio_extractor.cpython-310.pyc and b/lec2note/ingestion/__pycache__/audio_extractor.cpython-310.pyc differ

lec2note/ingestion/__pycache__/audio_extractor.cpython-313.pyc
ADDED
Binary file (3.69 kB).

lec2note/ingestion/__pycache__/whisper_runner.cpython-313.pyc
ADDED
Binary file (2.65 kB).

lec2note/ingestion/audio_extractor.py
CHANGED
```diff
@@ -55,7 +55,7 @@ class AudioExtractor:
         Returns
         -------
         Path
-            Path of the generated …
+            Path of the generated WAV file, named after the input video.
         """
 
         video_path = Path(video_fp).expanduser().resolve()
@@ -65,7 +65,9 @@ class AudioExtractor:
 
         out_dir = Path(output_dir or video_path.parent).expanduser().resolve()
         out_dir.mkdir(parents=True, exist_ok=True)
-…
+
+        # Use the same filename as the video but with .wav extension
+        audio_path = out_dir / f"{video_path.stem}.wav"
 
         # FFmpeg command
         cmd = [
```
CHANGED
|
Binary files a/lec2note/processing/__pycache__/processor.cpython-310.pyc and b/lec2note/processing/__pycache__/processor.cpython-310.pyc differ
|
|
|
lec2note/processing/processor.py
CHANGED
|
@@ -54,7 +54,7 @@ class Processor: # noqa: D101
|
|
| 54 |
# insert numbered placeholders into subtitles for reference
|
| 55 |
placeholder_subs = subtitle_text
|
| 56 |
for idx, _ in enumerate(synced["images"], start=1):
|
| 57 |
-
placeholder_subs += f"\n\n[IMG{idx}]
|
| 58 |
|
| 59 |
# Prompt with explicit mapping guidance
|
| 60 |
prompt_text = (
|
|
|
|
| 54 |
# insert numbered placeholders into subtitles for reference
|
| 55 |
placeholder_subs = subtitle_text
|
| 56 |
for idx, _ in enumerate(synced["images"], start=1):
|
| 57 |
+
placeholder_subs += f"\n\n[IMG{idx}]"
|
| 58 |
|
| 59 |
# Prompt with explicit mapping guidance
|
| 60 |
prompt_text = (
|
lec2note/scripts/__pycache__/run_pipeline.cpython-310.pyc
CHANGED
Binary files a/lec2note/scripts/__pycache__/run_pipeline.cpython-310.pyc and b/lec2note/scripts/__pycache__/run_pipeline.cpython-310.pyc differ

lec2note/scripts/__pycache__/run_pipeline.cpython-313.pyc
ADDED
Binary file (2.56 kB).

lec2note/scripts/run_pipeline.py
CHANGED
```diff
@@ -15,10 +15,9 @@ from lec2note.utils.logging_config import setup_logging
 from lec2note.ingestion.whisper_runner import WhisperRunner
 from lec2note.segmentation.visual_segmenter import VisualSegmenter
 from lec2note.segmentation.semantic_segmenter import SemanticSegmenter
-from lec2note.…
+from lec2note.segmentation.chunk_merger import ChunkMerger
 from lec2note.processing.processor import Processor
 from lec2note.synthesis.assembler import Assembler
-from lec2note.types import FinalChunk
 
 
 def main():  # noqa: D401
@@ -35,14 +34,7 @@ def main():  # noqa: D401
     wav = AudioExtractor.extract(video_path)
     subtitles = WhisperRunner.transcribe(wav)
 
-…
-    final_chunks_dict = SemanticSegmenter.refine(slide_chunks, subtitles)
-
-    keyframes = KeyframeExtractor.run(video_path)
-    final_chunks: list[FinalChunk] = []
-    for ch in final_chunks_dict:
-        fc = FinalChunk(start=ch["start"], end=ch["end"], images=keyframes)
-        final_chunks.append(fc)
+    final_chunks = ChunkMerger.run(subtitles, video_path)
 
     note_chunks = [Processor.generate_note(fc, subtitles) for fc in final_chunks]
     markdown = Assembler.merge(note_chunks)
```
lec2note/segmentation/__pycache__/chunk_merger.cpython-310.pyc
ADDED
Binary file (2.02 kB).

lec2note/segmentation/__pycache__/semantic_segmenter.cpython-310.pyc
CHANGED
Binary files a/lec2note/segmentation/__pycache__/semantic_segmenter.cpython-310.pyc and b/lec2note/segmentation/__pycache__/semantic_segmenter.cpython-310.pyc differ

lec2note/segmentation/__pycache__/sentence_chunker.cpython-310.pyc
ADDED
Binary file (2.03 kB).

lec2note/segmentation/chunk_merger.py
ADDED
@@ -0,0 +1,68 @@
```python
from __future__ import annotations

"""Hierarchical chunk merger implementing the strategy described in DEVELOPER_GUIDE.md.

Steps
-----
1. *Visual pre-merge* – use :pyclass:`~lec2note.segmentation.visual_segmenter.VisualSegmenter`
   to obtain slide-level chunks purely based on keyframe similarity.
2. *Semantic merge* – further merge / split those chunks according to subtitle
   semantic similarity via :pyclass:`~lec2note.segmentation.semantic_segmenter.SemanticSegmenter`.
3. *Image sampling* – collect all keyframes belonging to each final topic chunk
   and uniformly sample at most **6** images using
   :pyclass:`~lec2note.vision.image_sampler.ImageSampler`.

The output is a list of :pyclass:`lec2note.types.FinalChunk` dataclass instances
which are ready for downstream multimodal processing.
"""

# refactored: use VisualMerger instead of VisualSegmenter
import logging
from pathlib import Path
from typing import List, Dict

from lec2note.segmentation.visual_merger import VisualMerger
from lec2note.segmentation.semantic_segmenter import SemanticSegmenter
from lec2note.segmentation.sentence_chunker import SentenceChunker
from lec2note.types import FinalChunk
from lec2note.vision.image_sampler import ImageSampler

logger = logging.getLogger(__name__)

__all__ = ["ChunkMerger"]


class ChunkMerger:  # noqa: D101
    @classmethod
    def run(
        cls,
        subtitles: List[Dict],
        video_fp: str | Path,
    ) -> List[FinalChunk]:
        """Return list of topic-level FinalChunk objects ready for note generation."""
        video_path = Path(video_fp).expanduser().resolve()
        logger.info("[ChunkMerger] start merging pipeline on %s", video_path.name)

        # 1. micro-chunks with keyframes
        micro_chunks = SentenceChunker.run(subtitles, video_path)

        # 2. visual merge (merge micro_chunks by image similarity)
        visual_chunks = VisualMerger.merge(micro_chunks)

        # 3. semantic merge – refine by subtitle semantics
        topic_chunks_dict = SemanticSegmenter.refine(visual_chunks, subtitles)

        # 4. map micro to topic & sample images
        final_chunks: List[FinalChunk] = []
        for ch in topic_chunks_dict:
            imgs = [mc["keyframe_path"] for mc in micro_chunks if ch["start"] <= mc["start"] < ch["end"]]
            imgs_sampled = ImageSampler.sample(imgs, max_n=6)
            fc = FinalChunk(start=ch["start"], end=ch["end"], images=[Path(p) for p in imgs_sampled])
            final_chunks.append(fc)
        logger.info("[ChunkMerger] produced %d final topic chunks", len(final_chunks))
        return final_chunks
```
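Note the half-open interval test `ch["start"] <= mc["start"] < ch["end"]` in step 4: each micro-chunk's keyframe is assigned to exactly one topic chunk, so adjacent topic chunks never share a sampled image.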
lec2note/segmentation/sentence_chunker.py
ADDED
@@ -0,0 +1,80 @@
```python
from __future__ import annotations

"""Generate sentence-level micro-chunks from subtitles with keyframes.

This module takes the subtitle list (each element a dict with ``start``, ``end`` and
``text`` fields) together with the original video path, and produces a list of
micro-chunks. A micro-chunk is a dict containing:

* ``start`` – float seconds
* ``end`` – float seconds
* ``text`` – sentence text
* ``keyframe_path`` – saved image path of the representative frame captured at
  ``end`` timestamp of the sentence. This single frame will later be used by
  image-level integration modules.

The frame capture is delegated to :pyfunc:`lec2note.vision.frame_extractor.FrameExtractor.capture_at`.
"""

import logging
from pathlib import Path
from typing import List, Dict

from lec2note.vision.frame_extractor import FrameExtractor

__all__ = ["SentenceChunker"]

logger = logging.getLogger(__name__)


class SentenceChunker:  # noqa: D101
    @classmethod
    def run(
        cls,
        subtitles: List[Dict],
        video_fp: str | Path,
        *,
        output_dir: str | Path | None = None,
    ) -> List[Dict]:
        """Generate micro-chunks aligned with subtitle sentences.

        Parameters
        ----------
        subtitles
            List of subtitle dicts from ASR with ``start``, ``end``, ``text`` keys.
        video_fp
            Path to input video.
        output_dir
            Directory to store extracted keyframes. If *None*, a ``frames``
            sub-directory next to the video file is used.
        """
        video_path = Path(video_fp).expanduser().resolve()
        if not video_path.exists():
            raise FileNotFoundError(video_path)

        micro_chunks: List[Dict] = []
        timestamps: List[float] = [s["end"] for s in subtitles]
        keyframe_paths = FrameExtractor.capture_at(video_path, timestamps, output_dir=output_dir)
        # ensure same length
        if len(keyframe_paths) != len(subtitles):
            logger.warning(
                "[SentenceChunker] expected %d keyframes but got %d",
                len(subtitles),
                len(keyframe_paths),
            )

        for idx, sub in enumerate(subtitles):
            chunk = {
                "start": sub["start"],
                "end": sub["end"],
                "text": sub["text"],
                "keyframe_path": str(keyframe_paths[idx]) if idx < len(keyframe_paths) else "",
            }
            micro_chunks.append(chunk)
        logger.info("[SentenceChunker] generated %d micro-chunks", len(micro_chunks))
        return micro_chunks
```
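A usage sketch with hypothetical ASR output; the keyframe filename assumes the default `cap` prefix and a 30 fps video (the real index depends on the video's actual FPS):

```python
from lec2note.segmentation.sentence_chunker import SentenceChunker

subtitles = [
    {"start": 0.0, "end": 3.2, "text": "Welcome to the Lec2Note course"},
    {"start": 3.2, "end": 6.7, "text": "Today we introduce multimodal note generation"},
]
micro = SentenceChunker.run(subtitles, "lecture.mp4")
# e.g. [{"start": 0.0, "end": 3.2, "text": "...", "keyframe_path": ".../frames/cap_000096.png"}, ...]
```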
lec2note/segmentation/visual_merger.py
ADDED
@@ -0,0 +1,63 @@
```python
from __future__ import annotations

"""Merge adjacent sentence-level chunks by visual similarity.

Input: *micro_chunks* – list of dicts from SentenceChunker, each with
``start``, ``end``, ``text`` and ``keyframe_path``.

Algorithm:
1. Iterate in temporal order.
2. Compare keyframe of current sentence with keyframe of *buffer* (last kept
   micro chunk of current visual block) using
   :pyfunc:`lec2note.vision.image_comparator.ImageComparator.get_similarity`.
3. If similarity ≥ threshold (default 0.9) → merge (extend ``end`` of buffer),
   else flush buffer to output and start new buffer.
4. After merge, the **only keyframe kept for a visual block is that of the
   *last* sentence**, naturally satisfied because buffer always holds last
   sentence's keyframe.

Return: list of ``{start, end}`` dicts representing visual-level chunks, ready
for semantic refinement.
"""

import logging
from pathlib import Path
from typing import List, Dict

from lec2note.vision.image_comparator import ImageComparator

logger = logging.getLogger(__name__)

__all__ = ["VisualMerger"]


class VisualMerger:  # noqa: D101
    @classmethod
    def merge(
        cls,
        micro_chunks: List[Dict],
        *,
        sim_threshold: float = 0.9,
    ) -> List[Dict]:
        if not micro_chunks:
            return []

        visual_chunks: List[Dict] = []
        buffer = micro_chunks[0].copy()
        for mc in micro_chunks[1:]:
            # compare buffer keyframe (last sentence in current block) with mc keyframe
            try:
                sim = ImageComparator.get_similarity(buffer["keyframe_path"], mc["keyframe_path"])
            except Exception as exc:  # noqa: BLE001
                logger.warning("[VisualMerger] similarity calc failed: %s", exc)
                sim = 0.0  # force split
            if sim >= sim_threshold:
                # merge: extend end and replace keyframe/path to current (last)
                buffer["end"] = mc["end"]
                buffer["keyframe_path"] = mc["keyframe_path"]
            else:
                visual_chunks.append({"start": buffer["start"], "end": buffer["end"]})
                buffer = mc.copy()
        visual_chunks.append({"start": buffer["start"], "end": buffer["end"]})
        logger.info("[VisualMerger] merged %d micro → %d visual chunks", len(micro_chunks), len(visual_chunks))
        return visual_chunks
```
lec2note/synthesis/__pycache__/assembler.cpython-310.pyc
CHANGED
Binary files a/lec2note/synthesis/__pycache__/assembler.cpython-310.pyc and b/lec2note/synthesis/__pycache__/assembler.cpython-310.pyc differ

lec2note/synthesis/assembler.py
CHANGED
```diff
@@ -28,7 +28,6 @@ class Assembler:  # noqa: D101
         raw_md = "\n\n".join(body_parts)
         logger.info("[Assembler] merging %d note chunks", len(chunks))
 
-        # LLM post-polishing: optional, controlled by an environment variable
         logger.info("[Assembler] polishing with LLM…")
         try:
             if not os.getenv("OPENAI_API_KEY"):
@@ -60,11 +59,9 @@ class Assembler:  # noqa: D101
                 }
             ],
         )
-…
-…
-…
-        else:
-            polished = raw_md
+            polished = response.choices[0].message.content.strip()
+        except Exception:  # noqa: BLE001
+            polished = raw_md
 
         logger.info("[Assembler] final document length %d chars", len(polished))
         return TEMPLATE.format(content=polished)
```
lec2note/utils/__pycache__/logging_config.cpython-313.pyc
ADDED
Binary file (1.08 kB).

lec2note/vision/__pycache__/frame_extractor.cpython-310.pyc
ADDED
Binary file (1.96 kB).

lec2note/vision/__pycache__/image_sampler.cpython-310.pyc
ADDED
Binary file (1.24 kB).

lec2note/vision/__pycache__/keyframe_extractor.cpython-310.pyc
CHANGED
Binary files a/lec2note/vision/__pycache__/keyframe_extractor.cpython-310.pyc and b/lec2note/vision/__pycache__/keyframe_extractor.cpython-310.pyc differ

lec2note/vision/frame_extractor.py
ADDED
@@ -0,0 +1,76 @@
```python
from __future__ import annotations

"""Lightweight frame extractor utility used by SentenceChunker.

This wrapper around OpenCV provides a single classmethod ``capture_at`` which
accepts a list of timestamps and saves each captured frame as PNG into the
specified directory. Returned value is the list of saved ``Path`` objects in
exactly the same order as input timestamps.

Unlike :pyfunc:`lec2note.vision.keyframe_extractor.KeyframeExtractor` which
searches the whole video to locate slide changes, this extractor is precise and
only grabs frames at given times; therefore it is computationally cheaper.
"""

import logging
from pathlib import Path
from typing import List

import cv2  # type: ignore

__all__ = ["FrameExtractor"]

logger = logging.getLogger(__name__)


class FrameExtractor:  # noqa: D101
    @classmethod
    def capture_at(
        cls,
        video_fp: str | Path,
        timestamps: List[float],
        *,
        output_dir: str | Path | None = None,
        image_prefix: str = "cap",
    ) -> List[Path]:
        """Capture video frames at given timestamps.

        Parameters
        ----------
        video_fp
            Input video path.
        timestamps
            Seconds (float) where frames should be captured.
        output_dir
            Directory to store PNG images; default to ``frames`` next to video.
        image_prefix
            Prefix of output filenames.
        """
        video_path = Path(video_fp).expanduser().resolve()
        if not video_path.exists():
            raise FileNotFoundError(video_path)

        save_dir = Path(output_dir or video_path.parent / "frames").resolve()
        save_dir.mkdir(parents=True, exist_ok=True)

        cap = cv2.VideoCapture(str(video_path))
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        saved: List[Path] = []
        for ts in timestamps:
            frame_idx = int(ts * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            success, frame = cap.read()
            if not success:
                logger.warning("[FrameExtractor] failed reading frame at %.2fs", ts)
                continue
            out_fp = save_dir / f"{image_prefix}_{frame_idx:06d}.png"
            cv2.imwrite(str(out_fp), frame)
            saved.append(out_fp)
        cap.release()
        logger.info("[FrameExtractor] captured %d frames", len(saved))
        return saved
```
lec2note/vision/image_comparator.py
ADDED
@@ -0,0 +1,58 @@
```python
from __future__ import annotations

"""Compute similarity between two images.

Two complementary metrics are provided:

* **SSIM** – structural similarity on grayscale images.
* **dHash distance** – perceptual hash Hamming distance.

The public API exposes a single :pyfunc:`ImageComparator.get_similarity` method
returning a float in [0, 1] where **1.0** means identical slides and **0.0**
means completely different. Internally a simple weighted combination of SSIM
and inverted-normalised dHash distance is used.
"""

from pathlib import Path
from typing import Tuple

import cv2  # type: ignore
import imagehash  # type: ignore
from PIL import Image
from skimage.metrics import structural_similarity as ssim  # type: ignore

__all__ = ["ImageComparator"]


class ImageComparator:  # noqa: D101
    @staticmethod
    def _load_grayscale(fp: Path):
        img = cv2.imread(str(fp))
        if img is None:
            raise FileNotFoundError(fp)
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    @classmethod
    def _ssim(cls, fp1: Path, fp2: Path) -> float:
        g1, g2 = cls._load_grayscale(fp1), cls._load_grayscale(fp2)
        score = ssim(g1, g2)
        return float(score)

    @staticmethod
    def _dhash_dist(fp1: Path, fp2: Path) -> int:
        h1, h2 = imagehash.dhash(Image.open(fp1)), imagehash.dhash(Image.open(fp2))
        return h1 - h2  # type: ignore[return-value]

    @classmethod
    def get_similarity(cls, fp1: str | Path, fp2: str | Path) -> float:
        """Return similarity in range [0, 1]. Higher is more similar."""
        p1, p2 = Path(fp1).expanduser().resolve(), Path(fp2).expanduser().resolve()
        ssim_val = cls._ssim(p1, p2)
        dh_dist = cls._dhash_dist(p1, p2)
        dh_norm = max(0.0, 1.0 - dh_dist / 64)  # 64-bit hash
        return 0.7 * ssim_val + 0.3 * dh_norm
```
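For intuition, take hypothetical values SSIM = 0.95 and a dHash Hamming distance of 4: the combined score is 0.7 × 0.95 + 0.3 × (1 − 4/64) = 0.665 + 0.28125 ≈ 0.946, just above `VisualMerger`'s default 0.9 threshold, so the two frames would be merged into the same visual block.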
lec2note/vision/image_sampler.py
ADDED
@@ -0,0 +1,28 @@
```python
from __future__ import annotations

"""Utility to uniformly sample a subset of images from a list.

Used to limit the number of representative keyframes per topic chunk to a small
constant (default 6) for efficient downstream multi-modal prompting.
"""

from pathlib import Path
from typing import List

__all__ = ["ImageSampler"]


class ImageSampler:  # noqa: D101
    @staticmethod
    def sample(paths: List[str | Path], max_n: int = 6) -> List[str]:
        """Return (at most) *max_n* paths evenly sampled from *paths* list."""
        if len(paths) <= max_n:
            return [str(Path(p)) for p in paths]
        step = len(paths) / max_n
        idxs = [int(i * step) for i in range(max_n)]
        return [str(Path(paths[i])) for i in idxs]
```
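Worked example: with 10 input paths and `max_n=6`, `step = 10/6 ≈ 1.67` and the selected indices are `[0, 1, 3, 5, 6, 8]` — evenly spread, always including the first frame but not necessarily the last.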
lec2note/vision/keyframe_extractor.py
CHANGED
```diff
@@ -15,6 +15,23 @@ import logging
 from pathlib import Path
 from typing import List
 
+# optional progress bar
+try:
+    from tqdm.auto import tqdm  # type: ignore
+except ImportError:  # pragma: no cover
+    def tqdm(iterable=None, **kwargs):  # type: ignore
+        """Fallback tqdm when the package is not installed."""
+        if iterable is None:
+            class _Dummy:  # noqa: D401
+                def update(self, n=1):
+                    pass
+
+                def close(self):
+                    pass
+
+            return _Dummy()
+        return iterable
+
 __all__ = ["KeyframeExtractor"]
 
 
@@ -67,6 +84,10 @@ class KeyframeExtractor:
         frame_idx = 0
         saved_paths: List[Path] = []
 
+        # progress bar setup
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or None
+        pbar = tqdm(total=total_frames, desc="[KeyframeExtractor] extracting", unit="frame")
+
         while True:
             success, frame = cap.read()
             if not success:
@@ -79,6 +100,9 @@ class KeyframeExtractor:
             prev_frame = frame
 
             frame_idx += 1
+            pbar.update(1)
+
+        pbar.close()
         logging.getLogger(__name__).info("[KeyframeExtractor] saved %d keyframes to %s", len(saved_paths), save_dir)
 
         cap.release()
```