Spaces:

larawehbe
/

sehatech-demo

Build error

sehatech-demo / test_download_data.sh

Upload folder using huggingface_hub

965ac15 verified about 1 year ago

1.4 kB

	#!/bin/bash
	#
	# Download a small sample of open-access article PDFs listed by the PMC OA
	# web service and append one metadata entry per downloaded PDF to
	# metadata.txt in the current directory.
	#
	# Usage:    test_download_data.sh
	# Requires: bash, curl
	# Exit:     0 when the list was fetched and every attempted download
	#           succeeded; non-zero otherwise.

	set -euo pipefail

	# Specify the number of articles to download.
	# The value is sent to the service as the `limit` query parameter and is
	# also enforced locally as a cap on the number of records processed.
	# NOTE(review): whether the service itself honours `limit` is not verified
	# here - confirm against the OA service documentation.
	readonly limit=10

	# Endpoint that returns the list of articles with metadata in XML format.
	readonly oa_url="https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf&limit=$limit"

	# File that receives the metadata entries (appended, never truncated).
	readonly metadata_file=metadata.txt

	# Only record ids of this shape are processed.
	readonly pmc_id_re='^PMC[0-9]+$'

	#######################################
	# Print the value of one double-quoted attribute of an XML tag.
	# The value is printed verbatim (XML entities are not decoded).
	# Arguments: $1 - tag text without the surrounding '<' and '>'
	#            $2 - attribute name
	# Outputs:   the attribute value on stdout, or nothing when it is absent
	# Returns:   0 always
	#######################################
	xml_attr() {
	  local tag=$1 name=$2
	  # Require whitespace before the name so "id" cannot match inside "xid".
	  local re="[[:space:]]${name}=\"([^\"]*)\""
	  if [[ $tag =~ $re ]]; then
	    printf '%s' "${BASH_REMATCH[1]}"
	  fi
	}

	#######################################
	# Report one article and, when it has a PDF link, download the PDF and
	# record its metadata.
	# Globals:   metadata_file (read)
	# Arguments: $1 - PMC id
	#            $2 - value of the record's `citation` attribute (reported
	#                 under the label "Title")
	#            $3 - PDF URL, may be empty
	# Outputs:   progress on stdout, failures on stderr; the PDF is written to
	#            the current directory under its remote file name
	# Returns:   0 on success or when there is no PDF link, 1 when the
	#            download fails
	#######################################
	process_record() {
	  local pmc_id=$1 title=$2 pdf_link=$3

	  printf '%s\n' "Processing article ID: $pmc_id"

	  # Check if we found a PDF link
	  if [[ -z $pdf_link ]]; then
	    printf '%s\n' "No PDF link found for article ID: $pmc_id"
	    return 0
	  fi

	  # Print metadata
	  printf '%s\n' "Title: $title"
	  printf '%s\n' "Downloading PDF from: $pdf_link"

	  # Download the PDF. -f makes HTTP error responses fail instead of being
	  # saved as if they were the PDF; stdin is detached so curl can never
	  # consume the XML that the caller's loop is reading.
	  if ! curl -f -O "$pdf_link" </dev/null; then
	    printf '%s\n' "Download failed for article ID: $pmc_id" >&2
	    return 1
	  fi

	  # Save metadata only for PDFs that were actually downloaded.
	  {
	    printf '%s\n' "Title: $title"
	    printf '%s\n' "PDF Link: $pdf_link"
	    printf '%s\n' "---------------------"
	  } >> "$metadata_file"
	}

	#######################################
	# Fetch the article list and process up to $limit records.
	# Globals:   oa_url, limit, pmc_id_re (read)
	# Returns:   0 on success, 1 when the list cannot be fetched or any
	#            download failed
	#######################################
	main() {
	  local response tag
	  local cur_id='' cur_title='' cur_link=''
	  local seen=0 failures=0 record_done

	  if ! response=$(curl -fsS "$oa_url"); then
	    printf '%s\n' "Failed to fetch article list from $oa_url" >&2
	    return 1
	  fi

	  # Walk the response one tag at a time by splitting on '>'. This works
	  # whether the XML is pretty-printed or delivered on a single line.
	  # Limitation: a literal '>' inside an attribute value would split a tag.
	  while (( seen < limit )) && IFS= read -r -d '>' tag; do
	    tag=${tag##*<}   # drop text/whitespace preceding the tag's '<'
	    record_done=0

	    case $tag in
	      record[[:space:]]*)
	        # Start of a record: remember its id and citation.
	        cur_id=$(xml_attr "$tag" id)
	        cur_title=$(xml_attr "$tag" citation)
	        cur_link=''
	        if [[ ! $cur_id =~ $pmc_id_re ]]; then
	          cur_id=''          # not a PMC record; ignore it and its links
	        elif [[ $tag == */ ]]; then
	          record_done=1      # self-closing record carries no links
	        fi
	        ;;
	      link[[:space:]]*)
	        # Keep the first PDF link of the current record.
	        if [[ -n $cur_id && -z $cur_link \
	              && $(xml_attr "$tag" format) == pdf ]]; then
	          cur_link=$(xml_attr "$tag" href)
	        fi
	        ;;
	      /record | /record[[:space:]]*)
	        if [[ -n $cur_id ]]; then
	          record_done=1
	        fi
	        ;;
	    esac

	    if (( record_done )); then
	      seen=$((seen + 1))
	      process_record "$cur_id" "$cur_title" "$cur_link" \
	        || failures=$((failures + 1))
	      cur_id=''
	    fi
	  done <<< "$response"

	  if (( failures > 0 )); then
	    printf '%s\n' "$failures download(s) failed" >&2
	    return 1
	  fi
	}

	main "$@"