Spaces:
Sleeping
Sleeping
Improve content filtering and word validation
Browse files
- Enhanced passage quality filtering with statistical analysis instead of hardcoded patterns
- Added problematic word filtering to prevent inappropriate vocabulary selection
- Improved fallback logic when AI word selection fails validation
- Better narrative content detection to avoid technical/reference material
- src/aiService.js +26 -3
- src/clozeGameEngine.js +28 -13
src/aiService.js
CHANGED
|
@@ -169,6 +169,7 @@ REQUIREMENTS:
|
|
| 169 |
- Words must appear EXACTLY as written in the passage
|
| 170 |
- Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
|
| 171 |
- Skip any words that look malformed or concatenated
|
|
|
|
| 172 |
- NEVER select words from the first or last sentence/clause of the passage
|
| 173 |
- Choose words from the middle portions for better context dependency
|
| 174 |
|
|
@@ -205,9 +206,16 @@ Passage: "${passage}"`
|
|
| 205 |
try {
|
| 206 |
const words = JSON.parse(content);
|
| 207 |
if (Array.isArray(words)) {
|
| 208 |
-
//
|
|
|
|
| 209 |
const validWords = words.filter(word => {
|
| 210 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
if (level <= 2) {
|
| 212 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 213 |
} else if (level <= 4) {
|
|
@@ -230,9 +238,16 @@ Passage: "${passage}"`
|
|
| 230 |
const matches = content.match(/"([^"]+)"/g);
|
| 231 |
if (matches) {
|
| 232 |
const words = matches.map(m => m.replace(/"/g, ''));
|
| 233 |
-
//
|
|
|
|
| 234 |
const validWords = words.filter(word => {
|
| 235 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
if (level <= 2) {
|
| 237 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 238 |
} else if (level <= 4) {
|
|
@@ -316,6 +331,7 @@ SELECTION RULES:
|
|
| 316 |
- Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
|
| 317 |
- Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
|
| 318 |
- Avoid capitalized words, ALL-CAPS words, and table of contents entries
|
|
|
|
| 319 |
- NEVER select words from the first or last sentence/clause of each passage
|
| 320 |
- Choose words from the middle portions for better context dependency
|
| 321 |
- Words must appear EXACTLY as written in the passage
|
|
@@ -411,10 +427,17 @@ Return as JSON: {"passage1": {...}, "passage2": {...}}`
|
|
| 411 |
parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
|
| 412 |
parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
|
| 413 |
|
| 414 |
-
//
|
| 415 |
const validateWords = (words) => {
|
|
|
|
| 416 |
return words.filter(word => {
|
| 417 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
if (level <= 2) {
|
| 419 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 420 |
} else if (level <= 4) {
|
|
|
|
| 169 |
- Words must appear EXACTLY as written in the passage
|
| 170 |
- Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
|
| 171 |
- Skip any words that look malformed or concatenated
|
| 172 |
+
- Avoid dated or potentially offensive terms
|
| 173 |
- NEVER select words from the first or last sentence/clause of the passage
|
| 174 |
- Choose words from the middle portions for better context dependency
|
| 175 |
|
|
|
|
| 206 |
try {
|
| 207 |
const words = JSON.parse(content);
|
| 208 |
if (Array.isArray(words)) {
|
| 209 |
+
// Filter problematic words and validate word lengths based on level
|
| 210 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
| 211 |
const validWords = words.filter(word => {
|
| 212 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
| 213 |
+
const lowerWord = cleanWord.toLowerCase();
|
| 214 |
+
|
| 215 |
+
// Skip problematic words
|
| 216 |
+
if (problematicWords.includes(lowerWord)) return false;
|
| 217 |
+
|
| 218 |
+
// Check length constraints
|
| 219 |
if (level <= 2) {
|
| 220 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 221 |
} else if (level <= 4) {
|
|
|
|
| 238 |
const matches = content.match(/"([^"]+)"/g);
|
| 239 |
if (matches) {
|
| 240 |
const words = matches.map(m => m.replace(/"/g, ''));
|
| 241 |
+
// Filter problematic words and validate word lengths
|
| 242 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
| 243 |
const validWords = words.filter(word => {
|
| 244 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
| 245 |
+
const lowerWord = cleanWord.toLowerCase();
|
| 246 |
+
|
| 247 |
+
// Skip problematic words
|
| 248 |
+
if (problematicWords.includes(lowerWord)) return false;
|
| 249 |
+
|
| 250 |
+
// Check length constraints
|
| 251 |
if (level <= 2) {
|
| 252 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 253 |
} else if (level <= 4) {
|
|
|
|
| 331 |
- Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
|
| 332 |
- Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
|
| 333 |
- Avoid capitalized words, ALL-CAPS words, and table of contents entries
|
| 334 |
+
- Avoid dated or potentially offensive terms
|
| 335 |
- NEVER select words from the first or last sentence/clause of each passage
|
| 336 |
- Choose words from the middle portions for better context dependency
|
| 337 |
- Words must appear EXACTLY as written in the passage
|
|
|
|
| 427 |
parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
|
| 428 |
parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
|
| 429 |
|
| 430 |
+
// Filter problematic words and validate word lengths based on level
|
| 431 |
const validateWords = (words) => {
|
| 432 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
| 433 |
return words.filter(word => {
|
| 434 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
| 435 |
+
const lowerWord = cleanWord.toLowerCase();
|
| 436 |
+
|
| 437 |
+
// Skip problematic words
|
| 438 |
+
if (problematicWords.includes(lowerWord)) return false;
|
| 439 |
+
|
| 440 |
+
// Check length constraints
|
| 441 |
if (level <= 2) {
|
| 442 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
| 443 |
} else if (level <= 4) {
|
src/clozeGameEngine.js
CHANGED
|
@@ -148,22 +148,37 @@ class ClozeGame {
|
|
| 148 |
passage = sentences.join(' ');
|
| 149 |
}
|
| 150 |
|
| 151 |
-
//
|
| 152 |
const words = passage.split(/\s+/);
|
| 153 |
-
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
| 154 |
-
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
| 155 |
const totalWords = words.length;
|
| 156 |
|
| 157 |
-
//
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
//
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
attempts++;
|
| 168 |
continue;
|
| 169 |
}
|
|
|
|
| 148 |
passage = sentences.join(' ');
|
| 149 |
}
|
| 150 |
|
| 151 |
+
// Enhanced quality check based on narrative flow characteristics
|
| 152 |
const words = passage.split(/\s+/);
|
|
|
|
|
|
|
| 153 |
const totalWords = words.length;
|
| 154 |
|
| 155 |
+
// Count various quality indicators
|
| 156 |
+
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
| 157 |
+
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
| 158 |
+
const shortWords = words.filter(w => w.length <= 3).length;
|
| 159 |
+
const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
|
| 160 |
+
const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
| 161 |
+
|
| 162 |
+
// Calculate quality ratios
|
| 163 |
+
const capsRatio = capsCount / totalWords;
|
| 164 |
+
const numbersRatio = numbersCount / totalWords;
|
| 165 |
+
const shortWordRatio = shortWords / totalWords;
|
| 166 |
+
const punctuationRatio = punctuationMarks / totalWords;
|
| 167 |
+
const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
|
| 168 |
+
|
| 169 |
+
// Reject if passage shows signs of being technical/reference material
|
| 170 |
+
let qualityScore = 0;
|
| 171 |
+
let issues = [];
|
| 172 |
+
|
| 173 |
+
if (capsRatio > 0.05) { qualityScore += capsRatio * 20; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
|
| 174 |
+
if (numbersRatio > 0.03) { qualityScore += numbersRatio * 30; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
|
| 175 |
+
if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
|
| 176 |
+
if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
|
| 177 |
+
if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
|
| 178 |
+
|
| 179 |
+
// Reject if quality score indicates technical/non-narrative content
|
| 180 |
+
if (qualityScore > 3) {
|
| 181 |
+
console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
|
| 182 |
attempts++;
|
| 183 |
continue;
|
| 184 |
}
|