Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -99,39 +99,6 @@ def do_similarity(text_a: str, text_b: str, dims: int = DEFAULT_DIMS) -> float:
|
|
| 99 |
b = model.encode_document([text_b], normalize_embeddings=True, convert_to_numpy=True)[0][:dims]
|
| 100 |
return float(np.dot(a, b))
|
| 101 |
|
| 102 |
-
# Extractive summarization using EmbeddingGemma's Summarization prompt
|
| 103 |
-
def _split_sents(text: str):
|
| 104 |
-
parts = re.split(r"(?<=[\.!?])\s+", text.strip())
|
| 105 |
-
return [p.strip() for p in parts if p.strip()]
|
| 106 |
-
|
| 107 |
-
def summarize_extractive(text: str, n: int, dims: int, lambda_diversity: float = 0.7) -> str:
    """Return an extractive summary of *text* built from at most *n* sentences.

    Sentences are embedded with the "Summarization" prompt, truncated to the
    first *dims* components, and chosen greedily: the first pick is the
    sentence closest to the centroid of all sentence embeddings; each later
    pick maximises a blend of centroid similarity and dissimilarity to the
    sentences already chosen (an MMR-style score).

    Args:
        text: Input text; it is split into sentences with ``_split_sents``.
        n: Maximum number of sentences to keep. Values below 1 select nothing.
        dims: Number of leading embedding components to use.
        lambda_diversity: Weight of the diversity term; ``1 - lambda_diversity``
            weights the centroid-similarity term.

    Returns:
        The selected sentences joined by single spaces, in their original
        order, or ``""`` when the text contains no sentences.

    Note:
        Relies on ``model`` and ``np`` being defined elsewhere in the file.
    """
    sents = _split_sents(text)
    if not sents:
        return ""

    embs = model.encode(
        sents,
        prompt_name="Summarization",
        normalize_embeddings=True,
        convert_to_numpy=True,
        batch_size=128,
    )[:, :dims]

    # The vectors were normalised at full width; slicing to `dims` leaves them
    # shorter than unit length. Re-normalise so the dot products below are
    # cosine similarities again and `1 - sim` is a properly scaled distance.
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    embs = embs / np.maximum(norms, 1e-12)  # guard against an all-zero slice

    centroid = embs.mean(axis=0)
    base = embs @ centroid  # relevance of each sentence to the whole text

    picked = []
    for _ in range(min(n, len(sents))):
        if not picked:
            i = int(np.argmax(base))
        else:
            # Highest similarity of every sentence to any already-picked one.
            sim_to_sel = np.max(embs[picked] @ embs.T, axis=0)
            mmr = (1 - lambda_diversity) * base + lambda_diversity * (1 - sim_to_sel)
            i = int(np.argmax(mmr))
        picked.append(i)
        # Push the chosen sentence's relevance far down so it cannot win again.
        base[i] = -1e9

    # Picked indices are unique, so sorting them restores document order.
    return " ".join(sents[i] for i in sorted(picked))
|
| 134 |
-
|
| 135 |
# ---------- Gradio UI ----------
|
| 136 |
with gr.Blocks(title="EmbeddingGemma × Wikipedia (EN corpus)") as demo:
|
| 137 |
gr.Markdown(
|
|
@@ -144,7 +111,6 @@ with gr.Blocks(title="EmbeddingGemma × Wikipedia (EN corpus)") as demo:
|
|
| 144 |
- **Semantic search** (English queries)
|
| 145 |
- **Cross-lingual search** (queries in other languages → English articles)
|
| 146 |
- **Sentence similarity** (compare two texts)
|
| 147 |
-
- **Extractive summarization** (highlight key sentences from long text)
|
| 148 |
|
| 149 |
🔗 Learn more in the [EmbeddingGemma blog post](https://huggingface.co/blog/embeddinggemma).
|
| 150 |
"""
|
|
@@ -182,16 +148,5 @@ with gr.Blocks(title="EmbeddingGemma × Wikipedia (EN corpus)") as demo:
|
|
| 182 |
sim_out = gr.Number(label="Cosine similarity (-1..1)")
|
| 183 |
sim_btn.click(lambda x, y, d: do_similarity(x, y, int(d)), [a, b, dims2], sim_out)
|
| 184 |
|
| 185 |
-
# 4) Summarization (extractive)
|
| 186 |
-
with gr.TabItem("Summarization"):
|
| 187 |
-
gr.Markdown("**Extractive summarization** using EmbeddingGemma's `Summarization` prompt. Paste any long text.")
|
| 188 |
-
with gr.Row():
|
| 189 |
-
sum_dims = gr.Dropdown([str(d) for d in MATRYOSHKA_DIMS], value=str(DEFAULT_DIMS), label="Embedding dims")
|
| 190 |
-
sum_n = gr.Slider(1, 10, value=5, step=1, label="Sentences in summary")
|
| 191 |
-
sum_text = gr.Textbox(lines=12, label="Text to summarize", value="Paste a Wikipedia article (or any text) here…")
|
| 192 |
-
sum_btn = gr.Button("Summarize")
|
| 193 |
-
sum_out = gr.Textbox(lines=10, label="Summary")
|
| 194 |
-
sum_btn.click(lambda t, n, d: summarize_extractive(t, int(n), int(d)), [sum_text, sum_n, sum_dims], sum_out)
|
| 195 |
-
|
| 196 |
if __name__ == "__main__":
|
| 197 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 99 |
b = model.encode_document([text_b], normalize_embeddings=True, convert_to_numpy=True)[0][:dims]
|
| 100 |
return float(np.dot(a, b))
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
# ---------- Gradio UI ----------
|
| 103 |
with gr.Blocks(title="EmbeddingGemma × Wikipedia (EN corpus)") as demo:
|
| 104 |
gr.Markdown(
|
|
|
|
| 111 |
- **Semantic search** (English queries)
|
| 112 |
- **Cross-lingual search** (queries in other languages → English articles)
|
| 113 |
- **Sentence similarity** (compare two texts)
|
|
|
|
| 114 |
|
| 115 |
🔗 Learn more in the [EmbeddingGemma blog post](https://huggingface.co/blog/embeddinggemma).
|
| 116 |
"""
|
|
|
|
| 148 |
sim_out = gr.Number(label="Cosine similarity (-1..1)")
|
| 149 |
sim_btn.click(lambda x, y, d: do_similarity(x, y, int(d)), [a, b, dims2], sim_out)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
if __name__ == "__main__":
|
| 152 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|