Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -147,7 +147,7 @@ def create_prompt(extracted_text: str) -> str:
|
|
| 147 |
|
| 148 |
return prompt
|
| 149 |
|
| 150 |
-
def extract_data_with_gemini(text_file_path: str) -> dict:
|
| 151 |
try:
|
| 152 |
# Initialize Gemini
|
| 153 |
model = initialize_gemini()
|
|
@@ -157,7 +157,7 @@ def extract_data_with_gemini(text_file_path: str) -> dict:
|
|
| 157 |
extracted_text = f.read()
|
| 158 |
|
| 159 |
# Create prompt and get response
|
| 160 |
-
prompt = create_prompt(extracted_text)
|
| 161 |
response = model.generate_content(prompt)
|
| 162 |
|
| 163 |
# Parse the JSON response
|
|
@@ -178,6 +178,7 @@ def extract_data_with_gemini(text_file_path: str) -> dict:
|
|
| 178 |
|
| 179 |
# Main Processing Function
|
| 180 |
def process_pdf(pdf_file):
|
|
|
|
| 181 |
temp_dir = os.path.join(os.getcwd(), "temp_processing")
|
| 182 |
output_dir = os.path.join(temp_dir, 'output_images')
|
| 183 |
|
|
@@ -185,6 +186,9 @@ def process_pdf(pdf_file):
|
|
| 185 |
shutil.rmtree(temp_dir)
|
| 186 |
os.makedirs(output_dir, exist_ok=True)
|
| 187 |
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
# Convert PDF to images and process
|
| 190 |
images = convert_from_path(pdf_file.name)
|
|
@@ -206,7 +210,7 @@ def process_pdf(pdf_file):
|
|
| 206 |
text_file_path = os.path.join(output_dir, 'extracted_text.txt')
|
| 207 |
|
| 208 |
# Process with Gemini
|
| 209 |
-
extracted_data = extract_data_with_gemini(text_file_path)
|
| 210 |
|
| 211 |
# Save extracted data to JSON file
|
| 212 |
json_path = os.path.join(temp_dir, "extracted_data.json")
|
|
|
|
| 147 |
|
| 148 |
return prompt
|
| 149 |
|
| 150 |
+
def extract_data_with_gemini(text_file_path: str, path_to_data_to_extract: str) -> dict:
|
| 151 |
try:
|
| 152 |
# Initialize Gemini
|
| 153 |
model = initialize_gemini()
|
|
|
|
| 157 |
extracted_text = f.read()
|
| 158 |
|
| 159 |
# Create prompt and get response
|
| 160 |
+
prompt = create_prompt(extracted_text, path_to_data_to_extract)
|
| 161 |
response = model.generate_content(prompt)
|
| 162 |
|
| 163 |
# Parse the JSON response
|
|
|
|
| 178 |
|
| 179 |
# Main Processing Function
|
| 180 |
def process_pdf(pdf_file):
|
| 181 |
+
template_dir = os.path.join(os.getcwd(), "templates")
|
| 182 |
temp_dir = os.path.join(os.getcwd(), "temp_processing")
|
| 183 |
output_dir = os.path.join(temp_dir, 'output_images')
|
| 184 |
|
|
|
|
| 186 |
shutil.rmtree(temp_dir)
|
| 187 |
os.makedirs(output_dir, exist_ok=True)
|
| 188 |
|
| 189 |
+
## JSON of the data to extract with descriptions
|
| 190 |
+
path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
|
| 191 |
+
|
| 192 |
try:
|
| 193 |
# Convert PDF to images and process
|
| 194 |
images = convert_from_path(pdf_file.name)
|
|
|
|
| 210 |
text_file_path = os.path.join(output_dir, 'extracted_text.txt')
|
| 211 |
|
| 212 |
# Process with Gemini
|
| 213 |
+
extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
|
| 214 |
|
| 215 |
# Save extracted data to JSON file
|
| 216 |
json_path = os.path.join(temp_dir, "extracted_data.json")
|