Spaces:
Running
Running
fix book preloading and filtering issues
Browse files
- Relax isValidForCloze filtering criteria to allow more books through
- Reduce minimum text length from 5000 to 2000 characters
- Increase fragmentation threshold from 0.01 to 0.05
- Reduce minimum sentence count from 20 to 10
- Add enhanced error handling in preloadBooks function
- Add structure validation for HF API responses
- Add try-catch around individual book processing
- Improve logging to track book validation process
- src/bookDataService.js +24 -6
src/bookDataService.js
CHANGED
|
@@ -138,12 +138,29 @@ class HuggingFaceDatasetService {
|
|
| 138 |
if (response.ok) {
|
| 139 |
const data = await response.json();
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
// Process and filter books
|
| 142 |
this.preloadedBooks = data.rows
|
| 143 |
-
.map(row =>
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
console.log(`📚 Preloaded ${this.preloadedBooks.length} suitable books`);
|
|
|
|
|
|
|
| 147 |
}
|
| 148 |
} catch (error) {
|
| 149 |
console.warn('Failed to preload books:', error);
|
|
@@ -336,18 +353,19 @@ class HuggingFaceDatasetService {
|
|
| 336 |
|
| 337 |
const textLength = book.text.length;
|
| 338 |
|
| 339 |
-
//
|
| 340 |
-
if (textLength <
|
| 341 |
if (textLength > 500000) return false; // Too long for performance
|
| 342 |
|
| 343 |
// Check for excessive formatting (likely reference material)
|
| 344 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
| 345 |
-
if (lineBreakRatio > 0.
|
| 346 |
|
| 347 |
// Ensure it has actual narrative content
|
| 348 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
| 349 |
-
if (sentenceCount <
|
| 350 |
|
|
|
|
| 351 |
return true;
|
| 352 |
}
|
| 353 |
|
|
|
|
| 138 |
if (response.ok) {
|
| 139 |
const data = await response.json();
|
| 140 |
|
| 141 |
+
// Check if data has expected structure
|
| 142 |
+
if (!data.rows || !Array.isArray(data.rows)) {
|
| 143 |
+
console.error('Unexpected HF API response structure:', data);
|
| 144 |
+
return;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
console.log(`📥 Received ${data.rows.length} books from HF API`);
|
| 148 |
+
|
| 149 |
// Process and filter books
|
| 150 |
this.preloadedBooks = data.rows
|
| 151 |
+
.map(row => {
|
| 152 |
+
try {
|
| 153 |
+
return this.processHFBook(row.row);
|
| 154 |
+
} catch (e) {
|
| 155 |
+
console.warn('Error processing book:', e);
|
| 156 |
+
return null;
|
| 157 |
+
}
|
| 158 |
+
})
|
| 159 |
+
.filter(book => book && this.isValidForCloze(book));
|
| 160 |
|
| 161 |
console.log(`📚 Preloaded ${this.preloadedBooks.length} suitable books`);
|
| 162 |
+
} else {
|
| 163 |
+
console.error(`HF API request failed: ${response.status} ${response.statusText}`);
|
| 164 |
}
|
| 165 |
} catch (error) {
|
| 166 |
console.warn('Failed to preload books:', error);
|
|
|
|
| 353 |
|
| 354 |
const textLength = book.text.length;
|
| 355 |
|
| 356 |
+
// Relaxed filter criteria for cloze exercises
|
| 357 |
+
if (textLength < 2000) return false; // Minimum readable length
|
| 358 |
if (textLength > 500000) return false; // Too long for performance
|
| 359 |
|
| 360 |
// Check for excessive formatting (likely reference material)
|
| 361 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
| 362 |
+
if (lineBreakRatio > 0.05) return false; // Relaxed fragmentation threshold
|
| 363 |
|
| 364 |
// Ensure it has actual narrative content
|
| 365 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
| 366 |
+
if (sentenceCount < 10) return false; // Relaxed sentence requirement
|
| 367 |
|
| 368 |
+
console.log(`📖 Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
|
| 369 |
return true;
|
| 370 |
}
|
| 371 |
|