Spaces:
Running
on
Zero
Running
on
Zero
# Extract the tar archives listed in a CSV log into a target directory.
#
# Usage: bash <this-script> <root> <save_root>
#   root       directory where the tar files are located
#              (e.g. /data/scratch/pyp/datasets/emilia/downloads)
#   save_root  directory to extract into; created if missing
#              (e.g. /data/scratch/pyp/datasets/emilia/preprocessed/audio)
#
# Files used in the current working directory:
#   file_log.txt        input, one entry per line:
#                       filename, size, sha256, original_filename, url
#   file_log_debug.txt  entries already extracted. Entries whose filename is
#                       listed here are skipped; every successful extraction
#                       appends its filename.
#   untar_failures.log  rewritten on each confirmed run with the entries whose
#                       archive was missing or failed to extract.
#
# Requires bash (uses [[ ]], local, read -p).

log_file="file_log.txt"              # Full log of files to process
exist_log_file="file_log_debug.txt"  # Log of already processed files
failure_log="untar_failures.log"     # Log file for untar failures
filtered_log=""                      # Temp list of pending entries (set in main)

# Print $1 with leading and trailing whitespace removed.
# Pure parameter expansion: unlike `echo | xargs`, quotes, backslashes and
# inner whitespace in the value are left untouched.
trim() {
  local s=$1
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# Print a duration given in whole seconds as HH:MM:SS.
# Hours are not wrapped at 24, so long ETAs stay readable.
format_hms() {
  local total=$1
  printf '%02d:%02d:%02d' \
    $((total / 3600)) $((total % 3600 / 60)) $((total % 60))
}

# Print the lines of $log_file that still need processing.
# A line is skipped when it is blank or when its first comma-separated field
# (trimmed) equals the first field of any line in $exist_log_file. Comparing
# whole fields avoids substring hits ("one" hiding "one2"), and a missing or
# empty $exist_log_file simply means nothing is skipped.
filter_pending() {
  awk -F',' -v seen_file="$exist_log_file" '
    function trim(s) { gsub(/^[ \t\r]+|[ \t\r]+$/, "", s); return s }
    BEGIN {
      while ((getline line < seen_file) > 0) {
        split(line, parts, ",")
        key = trim(parts[1])
        if (key != "") seen[key] = 1
      }
      close(seen_file)
    }
    {
      if (trim($0) == "") next
      if (!(trim($1) in seen)) print
    }
  ' "$log_file"
}

# Append one entry, in the input log format, to $failure_log.
record_failure() {
  printf '%s, %s, %s, %s, %s\n' "$1" "$2" "$3" "$4" "$5" >> "$failure_log"
}

# Entry point.
# Arguments: $1 - root (tar directory), $2 - save_root (extraction target)
# Returns:   0 when the run completes (individual failures are logged, not
#            fatal), 1 on cancel or setup error, 2 on bad usage.
main() {
  root=${1:-}
  save_root=${2:-}

  if [[ -z "$root" || -z "$save_root" ]]; then
    printf 'Usage: %s <root> <save_root>\n' "${0##*/}" >&2
    return 2
  fi
  if [[ ! -f "$log_file" ]]; then
    printf 'Input log not found: %s\n' "$log_file" >&2
    return 1
  fi
  if ! mkdir -p -- "$save_root"; then
    printf 'Cannot create output directory: %s\n' "$save_root" >&2
    return 1
  fi

  # Unpredictable temp file, removed on every exit path (including Ctrl-C).
  filtered_log=$(mktemp) || { printf 'mktemp failed\n' >&2; return 1; }
  trap 'rm -f -- "$filtered_log"' EXIT

  if ! filter_pending > "$filtered_log"; then
    printf 'Failed to filter %s\n' "$log_file" >&2
    return 1
  fi

  # Count total filtered files ($(( )) strips the padding some wc builds add).
  local total_files
  total_files=$(( $(wc -l < "$filtered_log") ))
  echo "Found $total_files entries to process in $log_file."
  if (( total_files == 0 )); then
    echo "Nothing to do."
    rm -f -- "$filtered_log"
    return 0
  fi

  # Print the filtered files
  echo "Filtered files to process:"
  cat -- "$filtered_log"
  echo

  # Confirm before starting processing
  local confirm
  read -r -p "Do you want to proceed with the above files? (y/n): " confirm
  if [[ "$confirm" != "y" ]]; then
    echo "Operation canceled."
    rm -f -- "$filtered_log"
    return 1
  fi

  # Clear the previous failure log only once the run is confirmed, so that
  # cancelling does not discard the last run's failures.
  : > "$failure_log"

  local filename size local_sha256 original_filename url tar_file
  local start_time now elapsed remain avg_cs eta_seconds
  local count=0
  start_time=$(date +%s)

  while IFS=',' read -r filename size local_sha256 original_filename url; do
    count=$((count + 1))

    # Trim leading/trailing whitespace
    filename=$(trim "$filename")
    size=$(trim "$size")
    local_sha256=$(trim "$local_sha256")
    original_filename=$(trim "$original_filename")
    url=$(trim "$url")

    # Construct the full path to the tar file
    tar_file="${root}/${original_filename}"

    if [[ ! -f "$tar_file" ]]; then
      echo "❌ File not found: $tar_file"
      record_failure "$filename" "$size" "$local_sha256" \
        "$original_filename" "$url"
    else
      echo "[$count/$total_files] Untarring $tar_file..."
      if tar -xf "$tar_file" -C "$save_root"; then
        echo "✅ Successfully untarred: $tar_file"
        # Remember the entry so later runs skip it.
        printf '%s\n' "$filename" >> "$exist_log_file"
      else
        echo "❌ Failed to untar: $tar_file"
        record_failure "$filename" "$size" "$local_sha256" \
          "$original_filename" "$url"
      fi
    fi

    # Elapsed time, average time per file, and ETA. count >= 1 here, so the
    # divisions are safe. The average is kept in hundredths of a second
    # (rounded) to show two decimals without floating point.
    now=$(date +%s)
    elapsed=$((now - start_time))
    remain=$((total_files - count))
    avg_cs=$(((elapsed * 100 + count / 2) / count))
    eta_seconds=$(((elapsed * remain + count / 2) / count))
    printf 'Elapsed: %ds | Avg/f: %d.%02ds | Remaining: %d files | ETA: ~%s\n' \
      "$elapsed" $((avg_cs / 100)) $((avg_cs % 100)) "$remain" \
      "$(format_hms "$eta_seconds")"
  done < "$filtered_log"

  # Clean up temporary filtered log
  rm -f -- "$filtered_log"

  # Summary
  echo "Untar operation completed. Check $failure_log for any failures."
  return 0
}

main "$@"