Update app2.py
app2.py CHANGED
@@ -185,7 +185,7 @@ class EnhancedURLProcessor:
                 'url': url,
                 'raw_content': None,
                 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(),
-
+                              'status_code': getattr(e.response, 'status_code', None)},
                 'extracted_data': None,
                 'processing_notes': [f"Failed to fetch content: {str(e)}"]
             }
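
For context, this works because requests exceptions carry a response attribute that is None for network-level failures and a full Response object for HTTP errors, so getattr(e.response, 'status_code', None) degrades gracefully in both cases. A minimal standalone sketch, with an illustrative URL handler that is not from app2.py:

import requests

def fetch_with_status(url: str):
    """Return (body, status_code); status_code is None when no response ever arrived."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text, resp.status_code
    except requests.RequestException as e:
        # e.response is None for connection errors/timeouts, a Response for HTTP errors
        return None, getattr(e.response, 'status_code', None)
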
@@ -261,7 +261,9 @@ class EnhancedURLProcessor:
             'title': None,
             'meta_description': None,
             'full_text': "",
-            'links': []
+            'links': [],
+            'images': [],
+            'media': []
         }
         try:
             soup = BeautifulSoup(content, 'html.parser')
@@ -270,6 +272,8 @@ class EnhancedURLProcessor:
             meta_desc = soup.find('meta', attrs={'name': 'description'})
             if meta_desc and meta_desc.get('content'):
                 extracted['meta_description'] = meta_desc['content'].strip()
+
+            # Extract links
             unique_links = set()
             for a_tag in soup.find_all('a', href=True):
                 href = a_tag['href'].strip()
@@ -287,6 +291,27 @@ class EnhancedURLProcessor:
                 elif urlparse(href).netloc and href not in unique_links:
                     extracted['links'].append({'text': text, 'url': href})
                     unique_links.add(href)
+
+            # Extract images
+            unique_images = set()
+            for img_tag in soup.find_all('img', src=True):
+                src = img_tag['src'].strip()
+                alt = img_tag.get('alt', '').strip()
+                if src and src not in unique_images:
+                    absolute_url = urljoin(base_url, src)
+                    extracted['images'].append({'src': absolute_url, 'alt': alt})
+                    unique_images.add(src)
+
+            # Extract media (audio/video)
+            unique_media = set()
+            for media_tag in soup.find_all(['audio', 'video'], src=True):
+                src = media_tag['src'].strip()
+                if src and src not in unique_media:
+                    absolute_url = urljoin(base_url, src)
+                    extracted['media'].append({'src': absolute_url, 'type': media_tag.name})
+                    unique_media.add(src)
+
+            # Extract text content
             soup_copy = BeautifulSoup(content, 'html.parser')
             for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract()
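
As a self-contained illustration of the extraction pattern this hunk adds, the sketch below runs the same img/audio/video scan over made-up HTML; the sample markup and the base_url value are invented for the example:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<a href="/docs">Docs</a><img src="/logo.png" alt="Logo"><video src="clip.mp4"></video>'
base_url = "https://example.com/page"  # illustrative
soup = BeautifulSoup(html, 'html.parser')

images, media = [], []
unique_images, unique_media = set(), set()
for img_tag in soup.find_all('img', src=True):
    src = img_tag['src'].strip()
    if src and src not in unique_images:
        images.append({'src': urljoin(base_url, src), 'alt': img_tag.get('alt', '').strip()})
        unique_images.add(src)
for media_tag in soup.find_all(['audio', 'video'], src=True):
    src = media_tag['src'].strip()
    if src and src not in unique_media:
        media.append({'src': urljoin(base_url, src), 'type': media_tag.name})
        unique_media.add(src)

print(images)  # [{'src': 'https://example.com/logo.png', 'alt': 'Logo'}]
print(media)   # [{'src': 'https://example.com/clip.mp4', 'type': 'video'}]

Passing every src through urljoin is what turns relative paths like /logo.png into absolute URLs against base_url.
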
@@ -701,7 +726,7 @@ class EnhancedFileProcessor:
         elif archive_extension in ('.tar', '.gz', '.tgz'):
             try:
                 mode = 'r'
-                if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz'
+                if archive_extension in ('.tar.gz', '.tgz', '.gz'): mode = 'r:gz'
                 with tarfile.open(archive_path, mode) as tar_ref:
                     for member in tar_ref.getmembers():
                         if member.isfile():
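
The one-line change above switches tarfile into gzip mode for .tgz/.gz names. A small self-contained check of that mode logic, using an in-memory archive instead of a real file (names are illustrative):

import io
import tarfile

# Build a tiny gzip-compressed tar in memory so the read path can be exercised end to end.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w:gz') as tw:
    data = b"hello"
    info = tarfile.TarInfo(name="hello.txt")
    info.size = len(data)
    tw.addfile(info, io.BytesIO(data))
buf.seek(0)

archive_name = "bundle.tar.gz"  # illustrative; the data is read from the buffer above
mode = 'r:gz' if archive_name.endswith(('.tar.gz', '.tgz', '.gz')) else 'r'
with tarfile.open(fileobj=buf, mode=mode) as tar_ref:
    print([m.name for m in tar_ref.getmembers() if m.isfile()])  # ['hello.txt']
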
@@ -739,30 +764,30 @@ class EnhancedFileProcessor:
                                 f"Failed to clean up extracted file {extracted_file_path}: {e}")
             except tarfile.TarError as e:
                 logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
-        elif archive_extension == '.gz':
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        elif archive_extension == '.gz':  # This case is handled by tarfile, but added for single .gz files
+            extracted_name = archive_path.stem
+            extracted_path = extract_to / extracted_name
+            try:
+                with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
+                    outfile.write(gz_file.read())
+                if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(
+                        extracted_path):
+                    dataset.extend(self._process_single_file(extracted_path))
+                elif extracted_path.suffix.lower() in self.archive_extensions:
+                    logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
+                    dataset.extend(self._process_archive(extracted_path, extract_to))
+                else:
+                    logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
+            except gzip.GzipFile as e:
+                logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
+            except Exception as e:
+                logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
+            finally:
+                if extracted_path.exists():
+                    try:
+                        extracted_path.unlink()
+                    except OSError as e:
+                        logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
         elif archive_extension in ('.bz2', '.7z', '.rar'):
             logger.warning(
                 f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
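
For reference, a standalone sketch of the single-file .gz branch with illustrative names; note that gzip.GzipFile is the file class rather than an exception, so corrupt input is surfaced as gzip.BadGzipFile (Python 3.8+) or OSError, which is what this sketch catches:

import gzip
from pathlib import Path

def extract_single_gz(archive_path: Path, extract_to: Path):
    """Decompress e.g. notes.txt.gz into extract_to/notes.txt; return the new path or None."""
    extracted_path = extract_to / archive_path.stem  # Path('notes.txt.gz').stem == 'notes.txt'
    try:
        with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
            outfile.write(gz_file.read())
        return extracted_path
    except (gzip.BadGzipFile, OSError) as e:
        print(f"Error processing GZIP file '{archive_path.name}': {e}")
        return None
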
@@ -1041,9 +1066,9 @@ def respond_to_chat(
        filter_match = re.search(
            r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
            r'(?:where|by|for|with|if)\s+'
-            r'(\w+)\s+'
+            r'([\w\._-]+)\s+'  # Allow underscores, periods, and hyphens in column names
            r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
-            r'([\'"]?[\w\s
+            r'([\'"]?[\w\s\.-]+[\'"]?)',
            lower_message
        )
        if filter_match:
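
To see what the widened pattern captures, here is a quick standalone check; the chat message is made up, and in app2.py the same pattern is applied to lower_message:

import re

FILTER_RE = re.compile(
    r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
    r'(?:where|by|for|with|if)\s+'
    r'([\w\._-]+)\s+'
    r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
    r'([\'"]?[\w\s\.-]+[\'"]?)'
)

m = FILTER_RE.search("show me items where unit_price >= 10.5")
print(m.groups())  # ('unit_price', '>=', '10.5')
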
@@ -1056,57 +1081,57 @@ def respond_to_chat(
                 response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}"
                 new_filtered_df_state = None
             else:
-
+                df_to_filter = df.copy()  # Always filter from the full dataframe
                 try:
-                    target_value: Any
-                    col_dtype =
-
-
-
+                    target_value: Any = None
+                    col_dtype = df_to_filter[column_name].dtype
+
+                    is_numeric_op = operator in ['>', '>=', '<', '<=', '==', '!=']
+                    is_numeric_col = pd.api.types.is_numeric_dtype(col_dtype)
+
+                    if is_numeric_op and is_numeric_col:
                         try:
                             target_value = float(value_str)
                             col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
                         except ValueError:
                             response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
-                            target_value = None
                     elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
                         target_value = value_str.lower() == 'true'
                         col_series = df_to_filter[column_name].astype(bool, errors='ignore')
-                    else:
+                    else:  # Treat as string
                         target_value = str(value_str)
                         col_series = df_to_filter[column_name].astype(str).str.lower()
                         value_str_lower = target_value.lower()
-
+
+                    if not response:  # No error so far
+                        condition = None
                     if operator in ['is', 'equals', '==']:
-                        if
-                        col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series == target_value
                         else:
                             condition = col_series == value_str_lower
                     elif operator == '!=':
-                        if
-                        col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series != target_value
                         else:
                             condition = col_series != value_str_lower
-                    elif operator == '>' and
+                    elif operator == '>' and is_numeric_col:
                         condition = col_series > target_value
-                    elif operator == '>=' and
+                    elif operator == '>=' and is_numeric_col:
                         condition = col_series >= target_value
-                    elif operator == '<' and
+                    elif operator == '<' and is_numeric_col:
                         condition = col_series < target_value
-                    elif operator == '<=' and
+                    elif operator == '<=' and is_numeric_col:
                         condition = col_series <= target_value
-                    elif operator in ['contains', 'contain']
-                        condition =
-                    elif operator == 'starts with'
-                        condition =
-                    elif operator == 'ends with'
-                        condition =
+                    elif operator in ['contains', 'contain']:
+                        condition = df_to_filter[column_name].astype(str).str.contains(value_str, case=False, na=False)
+                    elif operator == 'starts with':
+                        condition = df_to_filter[column_name].astype(str).str.startswith(value_str, case=False, na=False)
+                    elif operator == 'ends with':
+                        condition = df_to_filter[column_name].astype(str).str.endswith(value_str, case=False, na=False)
                     else:
                         response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-
-                    if response: new_filtered_df_state = None
+
                     if condition is not None:
                         filtered_results_df = df_to_filter[condition]
                         if not filtered_results_df.empty:
@@ -1121,11 +1146,8 @@ def respond_to_chat(
                                 f"Here's a preview:\n```\n{preview_str}\n```\n"
                                 f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
                         else:
-                            new_filtered_df_state = pd.DataFrame()
+                            new_filtered_df_state = pd.DataFrame()  # Empty dataframe
                             response = f"No items found where '{column_name}' {operator} '{value_str}'."
-                    elif not response:
-                        response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-                        new_filtered_df_state = None
                 except ValueError as ve:
                     response = f"Invalid value '{value_str}' for numeric column '{column_name}'. {ve}"
                     new_filtered_df_state = None
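
Stripped of the chat plumbing, the numeric branch reduces to coercing the column with pd.to_numeric and building a boolean mask; a minimal sketch with toy data rather than the app's dataset:

import pandas as pd

df = pd.DataFrame({'name': ['a', 'b', 'c'], 'price': ['3', '12', '7.5']})  # toy data

column_name, operator, value_str = 'price', '>', '5'  # as parsed from the chat message
target_value = float(value_str)
col_series = pd.to_numeric(df[column_name], errors='coerce')  # non-numeric entries become NaN

condition = col_series > target_value
filtered_results_df = df[condition]
print(filtered_results_df)  # rows b (12) and c (7.5)
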
@@ -1563,17 +1585,17 @@ def create_modern_interface():
                         processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                         if content_result.get('linked_extractions'):
                             num_linked_processed = len([r for r in content_result['linked_extractions'] if
                                                         r and r.get('fetch_result') is not None])
                             processing_status_messages.append(
-                                f"
+                                f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
                     else:
                         processing_status_messages.append(f"❌ Failed to process URL: {url}")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                 else:
                     processing_status_messages.append(
                         f"❌ Failed to process URL: {url} (No result returned)")
@@ -1587,7 +1609,7 @@ def create_modern_interface():
                     for res in file_results:
                         if res.get('processing_notes'):
                             processing_status_messages.append(
-                                f"
+                                f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
                 else:
                     processing_status_messages.append(f"❌ Failed to process file: {file.name}")
             qr_paths = []