Update app2.py
app2.py CHANGED
@@ -185,7 +185,7 @@ class EnhancedURLProcessor:
                 'url': url,
                 'raw_content': None,
                 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(),
-
+                              'status_code': getattr(e.response, 'status_code', None)},
                 'extracted_data': None,
                 'processing_notes': [f"Failed to fetch content: {str(e)}"]
             }
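
For context, this works because requests exceptions carry a response attribute that is None for network-level failures and a full Response object for HTTP errors, so getattr(e.response, 'status_code', None) degrades gracefully in both cases. A minimal standalone sketch, with an illustrative URL handler that is not from app2.py:

import requests

def fetch_with_status(url: str):
    """Return (body, status_code); status_code is None when no response ever arrived."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text, resp.status_code
    except requests.RequestException as e:
        # e.response is None for connection errors/timeouts, a Response for HTTP errors
        return None, getattr(e.response, 'status_code', None)
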
@@ -261,7 +261,9 @@ class EnhancedURLProcessor:
             'title': None,
             'meta_description': None,
             'full_text': "",
-            'links': []
+            'links': [],
+            'images': [],
+            'media': []
         }
         try:
             soup = BeautifulSoup(content, 'html.parser')
@@ -270,6 +272,8 @@ class EnhancedURLProcessor:
             meta_desc = soup.find('meta', attrs={'name': 'description'})
             if meta_desc and meta_desc.get('content'):
                 extracted['meta_description'] = meta_desc['content'].strip()
+
+            # Extract links
             unique_links = set()
             for a_tag in soup.find_all('a', href=True):
                 href = a_tag['href'].strip()
@@ -287,6 +291,27 @@ class EnhancedURLProcessor:
                 elif urlparse(href).netloc and href not in unique_links:
                     extracted['links'].append({'text': text, 'url': href})
                     unique_links.add(href)
+
+            # Extract images
+            unique_images = set()
+            for img_tag in soup.find_all('img', src=True):
+                src = img_tag['src'].strip()
+                alt = img_tag.get('alt', '').strip()
+                if src and src not in unique_images:
+                    absolute_url = urljoin(base_url, src)
+                    extracted['images'].append({'src': absolute_url, 'alt': alt})
+                    unique_images.add(src)
+
+            # Extract media (audio/video)
+            unique_media = set()
+            for media_tag in soup.find_all(['audio', 'video'], src=True):
+                src = media_tag['src'].strip()
+                if src and src not in unique_media:
+                    absolute_url = urljoin(base_url, src)
+                    extracted['media'].append({'src': absolute_url, 'type': media_tag.name})
+                    unique_media.add(src)
+
+            # Extract text content
             soup_copy = BeautifulSoup(content, 'html.parser')
             for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract()
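
As a self-contained illustration of the extraction pattern this hunk adds, the sketch below runs the same img/audio/video scan over made-up HTML; the sample markup and the base_url value are invented for the example:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<a href="/docs">Docs</a><img src="/logo.png" alt="Logo"><video src="clip.mp4"></video>'
base_url = "https://example.com/page"  # illustrative
soup = BeautifulSoup(html, 'html.parser')

images, media = [], []
unique_images, unique_media = set(), set()
for img_tag in soup.find_all('img', src=True):
    src = img_tag['src'].strip()
    if src and src not in unique_images:
        images.append({'src': urljoin(base_url, src), 'alt': img_tag.get('alt', '').strip()})
        unique_images.add(src)
for media_tag in soup.find_all(['audio', 'video'], src=True):
    src = media_tag['src'].strip()
    if src and src not in unique_media:
        media.append({'src': urljoin(base_url, src), 'type': media_tag.name})
        unique_media.add(src)

print(images)  # [{'src': 'https://example.com/logo.png', 'alt': 'Logo'}]
print(media)   # [{'src': 'https://example.com/clip.mp4', 'type': 'video'}]

Passing every src through urljoin is what turns relative paths like /logo.png into absolute URLs against base_url.
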
@@ -701,7 +726,7 @@ class EnhancedFileProcessor:
         elif archive_extension in ('.tar', '.gz', '.tgz'):
             try:
                 mode = 'r'
-                if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz'
+                if archive_extension in ('.tar.gz', '.tgz', '.gz'): mode = 'r:gz'
                 with tarfile.open(archive_path, mode) as tar_ref:
                     for member in tar_ref.getmembers():
                         if member.isfile():
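
The one-line change above switches tarfile into gzip mode for .tgz/.gz names. A small self-contained check of that mode logic, using an in-memory archive instead of a real file (names are illustrative):

import io
import tarfile

# Build a tiny gzip-compressed tar in memory so the read path can be exercised end to end.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w:gz') as tw:
    data = b"hello"
    info = tarfile.TarInfo(name="hello.txt")
    info.size = len(data)
    tw.addfile(info, io.BytesIO(data))
buf.seek(0)

archive_name = "bundle.tar.gz"  # illustrative; the data is read from the buffer above
mode = 'r:gz' if archive_name.endswith(('.tar.gz', '.tgz', '.gz')) else 'r'
with tarfile.open(fileobj=buf, mode=mode) as tar_ref:
    print([m.name for m in tar_ref.getmembers() if m.isfile()])  # ['hello.txt']
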
@@ -739,30 +764,30 @@ class EnhancedFileProcessor:
                                 f"Failed to clean up extracted file {extracted_file_path}: {e}")
             except tarfile.TarError as e:
                 logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
-        elif archive_extension == '.gz':
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        elif archive_extension == '.gz':  # This case is handled by tarfile, but added for single .gz files
+            extracted_name = archive_path.stem
+            extracted_path = extract_to / extracted_name
+            try:
+                with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
+                    outfile.write(gz_file.read())
+                if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(
+                        extracted_path):
+                    dataset.extend(self._process_single_file(extracted_path))
+                elif extracted_path.suffix.lower() in self.archive_extensions:
+                    logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
+                    dataset.extend(self._process_archive(extracted_path, extract_to))
+                else:
+                    logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
+            except gzip.GzipFile as e:
+                logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
+            except Exception as e:
+                logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
+            finally:
+                if extracted_path.exists():
+                    try:
+                        extracted_path.unlink()
+                    except OSError as e:
+                        logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
         elif archive_extension in ('.bz2', '.7z', '.rar'):
             logger.warning(
                 f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
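
For reference, a standalone sketch of the single-file .gz branch with illustrative names; note that gzip.GzipFile is the file class rather than an exception, so corrupt input is surfaced as gzip.BadGzipFile (Python 3.8+) or OSError, which is what this sketch catches:

import gzip
from pathlib import Path

def extract_single_gz(archive_path: Path, extract_to: Path):
    """Decompress e.g. notes.txt.gz into extract_to/notes.txt; return the new path or None."""
    extracted_path = extract_to / archive_path.stem  # Path('notes.txt.gz').stem == 'notes.txt'
    try:
        with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
            outfile.write(gz_file.read())
        return extracted_path
    except (gzip.BadGzipFile, OSError) as e:
        print(f"Error processing GZIP file '{archive_path.name}': {e}")
        return None
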
@@ -1041,9 +1066,9 @@ def respond_to_chat(
        filter_match = re.search(
            r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
            r'(?:where|by|for|with|if)\s+'
-            r'(\w+)\s+'
+            r'([\w\._-]+)\s+'  # Allow underscores, periods, and hyphens in column names
            r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
-            r'([\'"]?[\w\s
+            r'([\'"]?[\w\s\.-]+[\'"]?)',
            lower_message
        )
        if filter_match:
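
To see what the widened pattern captures, here is a quick standalone check; the chat message is made up, and in app2.py the same pattern is applied to lower_message:

import re

FILTER_RE = re.compile(
    r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
    r'(?:where|by|for|with|if)\s+'
    r'([\w\._-]+)\s+'
    r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
    r'([\'"]?[\w\s\.-]+[\'"]?)'
)

m = FILTER_RE.search("show me items where unit_price >= 10.5")
print(m.groups())  # ('unit_price', '>=', '10.5')
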
@@ -1056,57 +1081,57 @@ def respond_to_chat(
                 response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}"
                 new_filtered_df_state = None
             else:
-
+                df_to_filter = df.copy()  # Always filter from the full dataframe
                 try:
-                    target_value: Any
-                    col_dtype =
-
-
-
+                    target_value: Any = None
+                    col_dtype = df_to_filter[column_name].dtype
+
+                    is_numeric_op = operator in ['>', '>=', '<', '<=', '==', '!=']
+                    is_numeric_col = pd.api.types.is_numeric_dtype(col_dtype)
+
+                    if is_numeric_op and is_numeric_col:
                         try:
                             target_value = float(value_str)
                             col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
                         except ValueError:
                             response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
-                            target_value = None
                     elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
                         target_value = value_str.lower() == 'true'
                         col_series = df_to_filter[column_name].astype(bool, errors='ignore')
-                    else:
+                    else:  # Treat as string
                         target_value = str(value_str)
                         col_series = df_to_filter[column_name].astype(str).str.lower()
                         value_str_lower = target_value.lower()
-
+
+                    if not response:  # No error so far
+                        condition = None
                     if operator in ['is', 'equals', '==']:
-                        if
-                        col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series == target_value
                         else:
                             condition = col_series == value_str_lower
                     elif operator == '!=':
-                        if
-                        col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series != target_value
                         else:
                             condition = col_series != value_str_lower
-                    elif operator == '>' and
+                    elif operator == '>' and is_numeric_col:
                         condition = col_series > target_value
-                    elif operator == '>=' and
+                    elif operator == '>=' and is_numeric_col:
                         condition = col_series >= target_value
-                    elif operator == '<' and
+                    elif operator == '<' and is_numeric_col:
                         condition = col_series < target_value
-                    elif operator == '<=' and
+                    elif operator == '<=' and is_numeric_col:
                         condition = col_series <= target_value
-                    elif operator in ['contains', 'contain']
-                        condition =
-                    elif operator == 'starts with'
-                        condition =
-                    elif operator == 'ends with'
-                        condition =
+                    elif operator in ['contains', 'contain']:
+                        condition = df_to_filter[column_name].astype(str).str.contains(value_str, case=False, na=False)
+                    elif operator == 'starts with':
+                        condition = df_to_filter[column_name].astype(str).str.startswith(value_str, case=False, na=False)
+                    elif operator == 'ends with':
+                        condition = df_to_filter[column_name].astype(str).str.endswith(value_str, case=False, na=False)
                     else:
                         response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-
-                    if response: new_filtered_df_state = None
+
                     if condition is not None:
                         filtered_results_df = df_to_filter[condition]
                         if not filtered_results_df.empty:
@@ -1121,11 +1146,8 @@ def respond_to_chat(
                                 f"Here's a preview:\n```\n{preview_str}\n```\n"
                                 f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
                         else:
-                            new_filtered_df_state = pd.DataFrame()
+                            new_filtered_df_state = pd.DataFrame()  # Empty dataframe
                             response = f"No items found where '{column_name}' {operator} '{value_str}'."
-                    elif not response:
-                        response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-                        new_filtered_df_state = None
                 except ValueError as ve:
                     response = f"Invalid value '{value_str}' for numeric column '{column_name}'. {ve}"
                     new_filtered_df_state = None
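
Stripped of the chat plumbing, the numeric branch reduces to coercing the column with pd.to_numeric and building a boolean mask; a minimal sketch with toy data rather than the app's dataset:

import pandas as pd

df = pd.DataFrame({'name': ['a', 'b', 'c'], 'price': ['3', '12', '7.5']})  # toy data

column_name, operator, value_str = 'price', '>', '5'  # as parsed from the chat message
target_value = float(value_str)
col_series = pd.to_numeric(df[column_name], errors='coerce')  # non-numeric entries become NaN

condition = col_series > target_value
filtered_results_df = df[condition]
print(filtered_results_df)  # rows b (12) and c (7.5)
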
@@ -1563,17 +1585,17 @@ def create_modern_interface():
                         processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                         if content_result.get('linked_extractions'):
                             num_linked_processed = len([r for r in content_result['linked_extractions'] if
                                                         r and r.get('fetch_result') is not None])
                             processing_status_messages.append(
-                                f"
+                                f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
                     else:
                         processing_status_messages.append(f"❌ Failed to process URL: {url}")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                 else:
                     processing_status_messages.append(
                         f"❌ Failed to process URL: {url} (No result returned)")
@@ -1587,7 +1609,7 @@ def create_modern_interface():
                     for res in file_results:
                         if res.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
                 else:
                     processing_status_messages.append(f"❌ Failed to process file: {file.name}")
             qr_paths = []