Spaces:

milwright
/

cloze-reader

Sleeping

milwright committed on Jun 17

Commit

e26fd03

1 Parent(s): bac89b2

Improve content filtering and word validation

- Enhanced passage quality filtering with statistical analysis instead of hardcoded patterns
- Added problematic word filtering to prevent inappropriate vocabulary selection
- Improved fallback logic when AI word selection fails validation
- Better narrative content detection to avoid technical/reference material

Files changed (2) hide show

src/aiService.js +26 -3
src/clozeGameEngine.js +28 -13

src/aiService.js CHANGED Viewed

@@ -169,6 +169,7 @@ REQUIREMENTS:
 - Words must appear EXACTLY as written in the passage
 - Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
 - Skip any words that look malformed or concatenated
 - NEVER select words from the first or last sentence/clause of the passage
 - Choose words from the middle portions for better context dependency
@@ -205,9 +206,16 @@ Passage: "${passage}"`
         try {
           const words = JSON.parse(content);
           if (Array.isArray(words)) {
-            // Validate word lengths based on level
             const validWords = words.filter(word => {
               const cleanWord = word.replace(/[^a-zA-Z]/g, '');
               if (level <= 2) {
                 return cleanWord.length >= 4 && cleanWord.length <= 7;
               } else if (level <= 4) {
@@ -230,9 +238,16 @@ Passage: "${passage}"`
           const matches = content.match(/"([^"]+)"/g);
           if (matches) {
             const words = matches.map(m => m.replace(/"/g, ''));
-            // Validate word lengths
             const validWords = words.filter(word => {
               const cleanWord = word.replace(/[^a-zA-Z]/g, '');
               if (level <= 2) {
                 return cleanWord.length >= 4 && cleanWord.length <= 7;
               } else if (level <= 4) {
@@ -316,6 +331,7 @@ SELECTION RULES:
 - Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
 - Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
 - Avoid capitalized words, ALL-CAPS words, and table of contents entries
 - NEVER select words from the first or last sentence/clause of each passage
 - Choose words from the middle portions for better context dependency
 - Words must appear EXACTLY as written in the passage
@@ -411,10 +427,17 @@ Return as JSON: {"passage1": {...}, "passage2": {...}}`
         parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
         parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
-        // Validate word lengths based on level
         const validateWords = (words) => {
           return words.filter(word => {
             const cleanWord = word.replace(/[^a-zA-Z]/g, '');
             if (level <= 2) {
               return cleanWord.length >= 4 && cleanWord.length <= 7;
             } else if (level <= 4) {

 - Words must appear EXACTLY as written in the passage
 - Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
 - Skip any words that look malformed or concatenated
+- Avoid dated or potentially offensive terms
 - NEVER select words from the first or last sentence/clause of the passage
 - Choose words from the middle portions for better context dependency
         try {
           const words = JSON.parse(content);
           if (Array.isArray(words)) {
+            // Filter problematic words and validate word lengths based on level
+            const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
             const validWords = words.filter(word => {
               const cleanWord = word.replace(/[^a-zA-Z]/g, '');
+              const lowerWord = cleanWord.toLowerCase();
+              // Skip problematic words
+              if (problematicWords.includes(lowerWord)) return false;
+              // Check length constraints
               if (level <= 2) {
                 return cleanWord.length >= 4 && cleanWord.length <= 7;
               } else if (level <= 4) {
           const matches = content.match(/"([^"]+)"/g);
           if (matches) {
             const words = matches.map(m => m.replace(/"/g, ''));
+            // Filter problematic words and validate word lengths
+            const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
             const validWords = words.filter(word => {
               const cleanWord = word.replace(/[^a-zA-Z]/g, '');
+              const lowerWord = cleanWord.toLowerCase();
+              // Skip problematic words
+              if (problematicWords.includes(lowerWord)) return false;
+              // Check length constraints
               if (level <= 2) {
                 return cleanWord.length >= 4 && cleanWord.length <= 7;
               } else if (level <= 4) {
 - Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
 - Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
 - Avoid capitalized words, ALL-CAPS words, and table of contents entries
+- Avoid dated or potentially offensive terms
 - NEVER select words from the first or last sentence/clause of each passage
 - Choose words from the middle portions for better context dependency
 - Words must appear EXACTLY as written in the passage
         parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
         parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
+        // Filter problematic words and validate word lengths based on level
         const validateWords = (words) => {
+          const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
           return words.filter(word => {
             const cleanWord = word.replace(/[^a-zA-Z]/g, '');
+            const lowerWord = cleanWord.toLowerCase();
+            // Skip problematic words
+            if (problematicWords.includes(lowerWord)) return false;
+            // Check length constraints
             if (level <= 2) {
               return cleanWord.length >= 4 && cleanWord.length <= 7;
             } else if (level <= 4) {

src/clozeGameEngine.js CHANGED Viewed

@@ -148,22 +148,37 @@ class ClozeGame {
         passage = sentences.join(' ');
       }
-      // Quality check: reject passages with excessive caps, numbers, or special formatting
       const words = passage.split(/\s+/);
-      const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
-      const numbersCount = words.filter(w => /\d/.test(w)).length;
       const totalWords = words.length;
-      // Skip if more than 10% caps or 5% numbers
-      if (capsCount / totalWords > 0.1 || numbersCount / totalWords > 0.05) {
-        console.log(`Skipping passage with ${capsCount} caps and ${numbersCount} numbers out of ${totalWords} words`);
-        attempts++;
-        continue;
-      }
-      // Check for other quality issues
-      if (passage.includes('CHAPTER') || passage.includes('Section') ||
-          passage.match(/\b(Fig\.|Table|Illustration)\b/)) {
         attempts++;
         continue;
       }

         passage = sentences.join(' ');
       }
+      // Enhanced quality check based on narrative flow characteristics
       const words = passage.split(/\s+/);
       const totalWords = words.length;
+      // Count various quality indicators
+      const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
+      const numbersCount = words.filter(w => /\d/.test(w)).length;
+      const shortWords = words.filter(w => w.length <= 3).length;
+      const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
+      const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
+      // Calculate quality ratios
+      const capsRatio = capsCount / totalWords;
+      const numbersRatio = numbersCount / totalWords;
+      const shortWordRatio = shortWords / totalWords;
+      const punctuationRatio = punctuationMarks / totalWords;
+      const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
+      // Reject if passage shows signs of being technical/reference material
+      let qualityScore = 0;
+      let issues = [];
+      if (capsRatio > 0.05) { qualityScore += capsRatio * 20; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
+      if (numbersRatio > 0.03) { qualityScore += numbersRatio * 30; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
+      if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
+      if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
+      if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
+      // Reject if quality score indicates technical/non-narrative content
+      if (qualityScore > 3) {
+        console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
         attempts++;
         continue;
       }