Spaces:
Sleeping
Sleeping
fix: improve word selection and passage quality filtering
Browse files
- Add passage quality checks to skip texts with excessive caps/numbers
- Update AI prompt to avoid malformed words and OCR errors
- Remove hardcoded word filtering in favor of AI-driven selection
- Increase max word length from 10 to 12 characters
- src/aiService.js +11 -2
- src/clozeGameEngine.js +42 -14
src/aiService.js
CHANGED
|
@@ -121,10 +121,19 @@ class OpenRouterService {
|
|
| 121 |
model: this.model,
|
| 122 |
messages: [{
|
| 123 |
role: 'system',
|
| 124 |
-
content: '
|
| 125 |
}, {
|
| 126 |
role: 'user',
|
| 127 |
-
content: `Select exactly ${count}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
Passage: "${passage}"`
|
| 130 |
}],
|
|
|
|
| 121 |
model: this.model,
|
| 122 |
messages: [{
|
| 123 |
role: 'system',
|
| 124 |
+
content: 'You are a vocabulary selector for educational cloze exercises. Select meaningful, properly-spelled content words that appear exactly as written in the passage.'
|
| 125 |
}, {
|
| 126 |
role: 'user',
|
| 127 |
+
content: `Select exactly ${count} words from this passage for a cloze exercise.
|
| 128 |
+
|
| 129 |
+
REQUIREMENTS:
|
| 130 |
+
- Choose clear, properly-spelled words (no OCR errors like "andsatires")
|
| 131 |
+
- Select meaningful nouns, verbs, or adjectives (4-12 letters)
|
| 132 |
+
- Words must appear EXACTLY as written in the passage
|
| 133 |
+
- Avoid: function words, archaic terms, proper nouns, technical jargon
|
| 134 |
+
- Skip any words that look malformed or concatenated
|
| 135 |
+
|
| 136 |
+
Return ONLY a JSON array of the selected words.
|
| 137 |
|
| 138 |
Passage: "${passage}"`
|
| 139 |
}],
|
src/clozeGameEngine.js
CHANGED
|
@@ -85,13 +85,17 @@ class ClozeGame {
|
|
| 85 |
const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
|
| 86 |
const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
// Clean up start - find first complete sentence that starts with capital letter
|
| 97 |
const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
|
|
@@ -106,12 +110,36 @@ class ClozeGame {
|
|
| 106 |
}
|
| 107 |
}
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
// Ensure minimum length - if too short, return what we have rather than infinite recursion
|
|
@@ -333,7 +361,7 @@ class ClozeGame {
|
|
| 333 |
const contentWordIndices = [];
|
| 334 |
words.forEach((word, index) => {
|
| 335 |
const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
|
| 336 |
-
if (cleanWord.length > 3 && cleanWord.length <=
|
| 337 |
contentWordIndices.push({ word: cleanWord, index });
|
| 338 |
}
|
| 339 |
});
|
|
|
|
| 85 |
const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
|
| 86 |
const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
|
| 87 |
|
| 88 |
+
let attempts = 0;
|
| 89 |
+
let passage = '';
|
| 90 |
+
|
| 91 |
+
while (attempts < 5) {
|
| 92 |
+
// Random position in the middle section
|
| 93 |
+
const availableLength = endAtThreeQuarters - startFromMiddle;
|
| 94 |
+
const randomOffset = Math.floor(Math.random() * Math.max(0, availableLength - 1000));
|
| 95 |
+
const startIndex = startFromMiddle + randomOffset;
|
| 96 |
+
|
| 97 |
+
// Extract longer initial passage for better sentence completion
|
| 98 |
+
passage = text.substring(startIndex, startIndex + 1000);
|
| 99 |
|
| 100 |
// Clean up start - find first complete sentence that starts with capital letter
|
| 101 |
const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
|
|
|
|
| 110 |
}
|
| 111 |
}
|
| 112 |
|
| 113 |
+
// Clean up end - ensure we end at a complete sentence
|
| 114 |
+
const sentences = passage.split(/(?<=[.!?])\s+/);
|
| 115 |
+
if (sentences.length > 1) {
|
| 116 |
+
// Remove the last sentence if it might be incomplete
|
| 117 |
+
sentences.pop();
|
| 118 |
+
passage = sentences.join(' ');
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
// Quality check: reject passages with excessive caps, numbers, or special formatting
|
| 122 |
+
const words = passage.split(/\s+/);
|
| 123 |
+
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
| 124 |
+
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
| 125 |
+
const totalWords = words.length;
|
| 126 |
+
|
| 127 |
+
// Skip if more than 10% caps or 5% numbers
|
| 128 |
+
if (capsCount / totalWords > 0.1 || numbersCount / totalWords > 0.05) {
|
| 129 |
+
console.log(`Skipping passage with ${capsCount} caps and ${numbersCount} numbers out of ${totalWords} words`);
|
| 130 |
+
attempts++;
|
| 131 |
+
continue;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
// Check for other quality issues
|
| 135 |
+
if (passage.includes('CHAPTER') || passage.includes('Section') ||
|
| 136 |
+
passage.match(/\b(Fig\.|Table|Illustration)\b/)) {
|
| 137 |
+
attempts++;
|
| 138 |
+
continue;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
// Good passage found
|
| 142 |
+
break;
|
| 143 |
}
|
| 144 |
|
| 145 |
// Ensure minimum length - if too short, return what we have rather than infinite recursion
|
|
|
|
| 361 |
const contentWordIndices = [];
|
| 362 |
words.forEach((word, index) => {
|
| 363 |
const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
|
| 364 |
+
if (cleanWord.length > 3 && cleanWord.length <= 12 && !functionWords.has(cleanWord)) {
|
| 365 |
contentWordIndices.push({ word: cleanWord, index });
|
| 366 |
}
|
| 367 |
});
|