diff --git "a/app2.py" "b/app2.py" --- "a/app2.py" +++ "b/app2.py" @@ -818,113 +818,20 @@ class EnhancedFileProcessor: json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':')) total_length = len(json_str.encode('utf-8')) # Get actual byte length for QR capacity - # The overhead needs to be dynamic. The maximum capacity of QR code - # is in *bytes*. So we need to encode the JSON and measure its length. - # A typical QR code can hold up to 2953 bytes (Version 40-L, Alphanumeric). - # For UTF-8, it's roughly 2953 * 0.7 = ~2000 characters if many non-ASCII. - # Let's use 2000 characters as a conservative estimate for max_size. - # However, the qrcode library will auto-select version based on *bytes* and error correction. - # So, our `max_size` (which is in bytes) should be the *maximum possible byte capacity*. - # Let's use a standard maximum byte capacity for QR code Version 40-L (error correction M). - # Max capacity for byte mode, Version 40-L: 2953 bytes. - # We will try to fit as much as possible, but need to subtract for our metadata. - - # Estimate overhead for metadata like {"idx": 0, "tc": 1, "tl": 12345, "hash": 1234567890, "data": ""} - # and for the "{startX}" and "{endX}" tags. - # `{start}` and `{end}` tags. - # Max idx could be e.g. 999 if many chunks, so 11 chars for {start999} and {end999}. - # Let's assume a generous overhead for the structural JSON + sequence tags. - # A typical metadata JSON string might be ~60-80 bytes. Tags add ~20 bytes. - # Let's target a safe `effective_chunk_size` of about 2800 bytes for data content. - - effective_max_qr_data_bytes = 2800 # A conservative estimate for actual data payload per QR after metadata - - # Calculate the number of chunks based on byte length - # The JSON object for each chunk will contain metadata + a portion of the original data. - # Example chunk JSON: {"idx":X,"tc":Y,"tl":Z,"hash":H,"data":""} - # The total string for the QR code will be "{startX}{}{endX}" - - # We need to calculate the maximum length of the string `""` - # such that when wrapped with `{startX}` and `{endX}`, it fits within `max_size` (2953). - - # Let's estimate the size of the *metadata* part of the chunk JSON, assuming 100 total chunks (max 3 digits for idx, tc). - # "{idx":99,"tc":99,"tl":999999,"hash":4294967295,"data":""}" -> roughly 60-70 bytes. - # Plus {startXXX} and {endXXX} -> 11 chars each. So, 22 chars + ~70 bytes for json metadata = ~92 bytes. - # This is complex because the `hash` and `tl` can vary in length. - # Let's fix a `base_metadata_size` and allocate the rest to `data`. - base_metadata_size_estimate = len(json.dumps({"idx": 999, "tc": 999, "tl": 99999999999, "hash": 4294967295, "data": ""}, separators=(',', ':'))) # ~70-80 bytes - - # The actual content for the QR code will be "{startX}{}{endX}" - # The length of "{startX}" and "{endX}" depends on X. Max X can be 999. - # So, len("{start999}") = 10 characters. - # Total tag overhead per chunk: 10 + 10 = 20 characters. - # Let's assume 1 byte per character for these tags for simplicity, as they are ASCII. - - # Max size of data *including our JSON wrapper and start/end tags* should be `max_size` (2953 bytes) - # Let's refine the effective chunk size: - # effective_chunk_size_for_data = max_size - (base_metadata_size_estimate + 20) - # This is still tricky because `hash` depends on `chunk_data_str`. - # A simpler, more reliable approach: calculate how many characters of the *original* `json_str` can fit. 
- - # Let's simplify the max_size for internal data: use 2000 characters (bytes for UTF-8) as a safe maximum for payload data. - # This accounts for the variable nature of UTF-8 characters and QR capacity. - # Max capacity for QR code (Version 40, Error M): 2953 bytes. - # Max characters that can be encoded in UTF-8: ~2000 (if mostly ASCII, more like 2953 chars). - # To be safe and ensure it fits, we'll aim for a character limit lower than the byte limit. - max_chars_per_qr = 1800 # Safe character count to fit within QR code, considering JSON structure and UTF-8 - - # Calculate approximate effective size for the original JSON string *portion* - # The full string for the QR will be: "{startX}{"idx":...,"data":"chunk_of_original_json"}{endX}" - # We need to account for: - # - `{startX}` and `{endX}` tags (approx 20 chars) - # - `{"idx":X,"tc":Y,"tl":Z,"hash":H,"data":""}` (approx 70 chars) - # Total overhead per QR code: ~90 characters. - # So, characters available for `chunk_of_original_json` = `max_chars_per_qr` - 90 = 1710 characters. - # This is an approximate, as hash and total_length can vary. - - # To make it robust, we will iterate and subtract available space. - # Let's try to fit as much of the original JSON string as possible into each QR. - # The QR code data will be a JSON string like: - # {"idx": i, "tc": num_chunks, "tl": total_length, "hash": chunk_hash, "data": "original_json_slice"} - # This entire dictionary will then be prefixed/suffixed. - - # The actual byte capacity of QR codes depends on version, error correction, and mode. - # For "Byte" mode, Version 40-L is 2953 bytes. - # The `qrcode` library auto-selects the smallest version. - # We'll work with `json_str` as UTF-8 bytes to be precise. - - json_bytes = json_str.encode('utf-8') - total_bytes_length = len(json_bytes) - - # Max bytes for data payload inside the chunk_dict (e.g., "original_json_slice") - # We need to calculate the overhead for the chunk metadata *itself* + start/end tags. - # Let's take a fixed max QR capacity (V40-L) of 2953 bytes for now. - # Then estimate the *maximum possible overhead* for the wrapper JSON and tags. - # Max overhead for `{"idx":999,"tc":999,"tl":99999999999,"hash":4294967295,"data":""}` (approx 70-80 bytes) - # Plus tags `{start999}` `{end999}` (20 bytes). Total overhead ~100 bytes. - # So, `effective_payload_bytes_per_qr` = 2953 - 100 = 2853 bytes. - # This estimate is critical. If it's too high, QR generation will fail; too low, too many QRs. - # A more accurate way: try to generate a QR with a very small payload and full metadata, see max capacity. - - # Let's go with a fixed `MAX_QR_CODE_BYTE_CAPACITY = 2953` (V40-L) as the total bytes a QR can hold. - # And `MAX_OVERHEAD_PER_CHUNK_BYTES` = 100. - # This means `MAX_DATA_PAYLOAD_BYTES_PER_CHUNK` = `MAX_QR_CODE_BYTE_CAPACITY` - `MAX_OVERHEAD_PER_CHUNK_BYTES` - # = 2953 - 100 = 2853 bytes. - - MAX_QR_CODE_BYTE_CAPACITY = 2953 # Version 40, Error Correction M, Byte mode - # Max possible length for tags {start}{end} and the meta fields - # Example: {start12345}{"idx":12344,"tc":12345,"tl":999999999,"hash":1234567890,"data":""}{end12345} - # Max idx/tc up to 5 digits means {start99999} is 12 chars. So 24 chars for tags. - # Min length of meta JSON (empty data): {"idx":0,"tc":1,"tl":0,"hash":0,"data":""} is ~50 chars. - # Max length of meta JSON (large numbers, empty data): {"idx":10000,"tc":10000,"tl":10000000000,"hash":4294967295,"data":""} is ~80 chars. - # Total maximum overhead estimate: 24 (tags) + 80 (meta) = 104 bytes. 
- # Let's use 120 bytes as a safe, generous overhead. - MAX_OVERHEAD_PER_CHUNK_BYTES = 120 - - effective_payload_bytes_per_chunk = MAX_QR_CODE_BYTE_CAPACITY - MAX_OVERHEAD_PER_CHUNK_BYTES + # MAX_QR_CODE_BYTE_CAPACITY = 2953 # Version 40, Error Correction M, Byte mode + # Increased MAX_OVERHEAD_PER_CHUNK_BYTES to ensure fitting + # A conservative estimate considering variable lengths of idx, tc, tl, hash, and start/end tags + MAX_OVERHEAD_PER_CHUNK_BYTES = 250 # Increased from 120, to be very safe + + # Let's target a slightly lower capacity than max_size to ensure it fits comfortably, + # especially for the auto-versioning of the `qrcode` library. + # Version 40-L is 2953 bytes. Let's aim for a practical max of 2900 bytes for our content. + PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY = 2900 + + effective_payload_bytes_per_chunk = PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY - MAX_OVERHEAD_PER_CHUNK_BYTES if effective_payload_bytes_per_chunk <= 0: - logger.error(f"Max QR size ({MAX_QR_CODE_BYTE_CAPACITY}) is too small for metadata overhead ({MAX_OVERHEAD_PER_CHUNK_BYTES}). Cannot chunk.") + logger.error(f"Effective payload size is zero or negative. QR size ({PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY}) is too small for metadata overhead ({MAX_OVERHEAD_PER_CHUNK_BYTES}). Cannot chunk.") return [] # Calculate number of chunks based on the original data's byte length @@ -956,14 +863,15 @@ class EnhancedFileProcessor: final_qr_string = f"{{start{i+1}}}{inner_json_string}{{end{i+1}}}" # Double check if the final_qr_string actually fits. - # This is a critical check for robustness, but might slow down if data is very large. - # For now, rely on our calculation based on fixed max capacity. - # If final_qr_string.encode('utf-8') > MAX_QR_CODE_BYTE_CAPACITY, then our estimates are off. - if len(final_qr_string.encode('utf-8')) > MAX_QR_CODE_BYTE_CAPACITY: - logger.warning(f"Chunk {i+1} exceeds estimated QR capacity. Calculated: {len(final_qr_string.encode('utf-8'))} bytes, Max: {MAX_QR_CODE_BYTE_CAPACITY} bytes. Adjusting MAX_OVERHEAD_PER_CHUNK_BYTES might be needed.") - # As a fallback, we can try to reduce the chunk_data_str length, - # but this means recalculating. For now, log warning and continue. - # A more robust solution might dynamically adjust effective_payload_bytes_per_chunk if this happens. + # This is a critical check for robustness. If this still fails, it means our overhead estimate is too low. + encoded_final_qr_string_len = len(final_qr_string.encode('utf-8')) + if encoded_final_qr_string_len > PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY: + logger.warning(f"Chunk {i+1} exceeds estimated QR capacity. Actual: {encoded_final_qr_string_len} bytes, Target Max: {PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY} bytes. Consider increasing MAX_OVERHEAD_PER_CHUNK_BYTES further.") + # In a production system, one might re-chunk here or raise an error. + # For now, we log and proceed, hoping the qrcode library can still find a higher version + # (though the error implies it can't go beyond 40). + # The error 'Invalid version (was 41, expected 1 to 40)' means even this PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY might be too generous if a chunk hits it exactly or exceeds it. + # The fix is to make `effective_payload_bytes_per_chunk` smaller, forcing more chunks but guaranteeing fit. chunks_for_qr.append(final_qr_string) current_byte_pos = end_byte_pos @@ -998,6 +906,8 @@ def generate_stylish_qr(data: Union[str, Dict], # including the {startN} and {endN} tags, and the inner JSON. 
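+    # Illustrative example of the pre-formatted chunk string this function receives
+    # (values assumed, not from a real run). With the chunking constants above,
+    # effective_payload_bytes_per_chunk = 2900 - 250 = 2650 bytes, so a 7421-byte
+    # payload is split into math.ceil(7421 / 2650) == 3 chunks, the first wrapped as:
+    #   {start1}{"idx":0,"tc":3,"tl":7421,"hash":123456789,"data":"..."}{end1}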
if isinstance(data, dict): # This path should ideally not be taken if chunk_data always returns strings + # and is only called with the pre-formatted chunk string. + # Keeping it as a fallback, but the primary use case is `data` being a string here. qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':'))) else: qr.add_data(str(data)) @@ -1104,9 +1014,1911 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li logger.info(f"Generated {len(paths)} QR codes.") return paths - except Exception as e: - logger.error(f"QR code generation error: {e}") - return [] +# --- Chatbot Logic --- +def respond_to_chat( + message: str, + chat_history: List[Tuple[str, str]], + chatbot_data: Optional[List[Dict]], + # Add current_filtered_df_state as input, it will be updated and returned + current_filtered_df_state: Optional[pd.DataFrame] +) -> Tuple[List[Tuple[str, str]], List[Dict], Optional[pd.DataFrame]]: + """ + Responds to user chat messages based on the loaded JSON data. + Manages and returns the state of the filtered DataFrame. + """ + if chatbot_data is None or not chatbot_data: + chat_history.append((message, "Please process some data first using the other tabs before chatting.")) + return chat_history, chatbot_data, current_filtered_df_state # Return existing state + + chat_history.append((message, "")) + + response = "" + lower_message = message.lower().strip() + # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it + new_filtered_df_state = current_filtered_df_state + + try: + # Attempt to flatten the data structure for easier querying + flat_data = [] + def flatten_item(d, parent_key='', sep='_'): + items = [] + if isinstance(d, dict): + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, (dict, list)): + items.extend(flatten_item(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + elif isinstance(d, list): + for i, elem in enumerate(d): + if isinstance(elem, (dict, list)): + items.extend(flatten_item(elem, f'{parent_key}_{i}' if parent_key else str(i), sep=sep).items()) + else: + items.append((f'{parent_key}_{i}' if parent_key else str(i), elem)) # Handle lists of non-dicts + # Note: If the top-level chatbot_data is NOT a list of dicts, this flattening might need adjustment. + # Assuming chatbot_data is a list of results, where each result is a dict. 
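+            # Illustrative example of the flattening above (input values assumed):
+            #   flatten_item({"price": 5, "tags": ["a", "b"]}, parent_key="item_0_extracted_data")
+            #   returns {"item_0_extracted_data_price": 5,
+            #            "item_0_extracted_data_tags_0": "a", "item_0_extracted_data_tags_1": "b"}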
+ + return dict(items) + + # Process each top-level item in chatbot_data + for i, item in enumerate(chatbot_data): + if isinstance(item, dict): + # Flatten the 'extracted_data' part if it exists and is a dict/list + extracted_data_part = item.get('extracted_data') + if isinstance(extracted_data_part, (dict, list)): + flat_item_data = flatten_item(extracted_data_part, parent_key=f'item_{i}_extracted_data') + # Include some top-level metadata if useful + metadata_part = {k: v for k, v in item.items() if k not in ['extracted_data', 'raw_content', 'linked_extractions']} + flat_data.append({**metadata_part, **flat_item_data}) + else: + # If extracted_data is not dict/list, just include top-level keys + flat_data.append({k: v for k, v in item.items() if k != 'raw_content'}) # Exclude raw_content + + elif isinstance(item, list): + # If a top-level item is a list itself (less common for single file/URL results but possible), flatten it + flat_data.extend(flatten_item(item, parent_key=f'item_{i}')) + else: + # Handle cases where top-level item is not a dict or list + flat_data.append({f'item_{i}_value': item}) + + + df = None + if flat_data: + try: + df = pd.DataFrame(flat_data) + logger.debug(f"Created DataFrame with shape: {df.shape}") + logger.debug(f"DataFrame columns: {list(df.columns)}") + except Exception as e: + logger.warning(f"Could not create pandas DataFrame from processed data: {e}. Falling back to manual processing.") + df = None + + # --- Complex Queries and Analysis --- + + if df is not None: + # List available columns + if "what columns are available" in lower_message or "list columns" in lower_message: + response = f"The available columns in the data are: {', '.join(df.columns)}" + + # Describe a specific column + match = re.search(r'describe column (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + description = df[column_name].describe().to_string() + response = f"Description for column '{column_name}':\n```\n{description}\n```" + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + + # How many unique values in a column? + match = re.search(r'how many unique values in (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + unique_count = df[column_name].nunique() + response = f"There are {unique_count} unique values in the '{column_name}' column." + else: + response = f"I couldn't find a column named '{column_name}' in the data. Available columns are: {', '.join(df.columns)}" + + # What is the average/sum/min/max of a numeric column? + match = re.search(r'what is the (average|sum|min|max) of (\w+)', lower_message) + if match: + operation, column_name = match.groups() + if column_name in df.columns: + try: + numeric_col = pd.to_numeric(df[column_name], errors='coerce') + numeric_col = numeric_col.dropna() + + if not numeric_col.empty: + if operation == 'average': + result = numeric_col.mean() + response = f"The average of '{column_name}' is {result:.2f}." + elif operation == 'sum': + result = numeric_col.sum() + response = f"The sum of '{column_name}' is {result:.2f}." + elif operation == 'min': + result = numeric_col.min() + response = f"The minimum of '{column_name}' is {result}." + elif operation == 'max': + result = numeric_col.max() + response = f"The maximum of '{column_name}' is {result}." + else: + response = "I can calculate average, sum, min, or max." 
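+                        # Illustrative example (column name assumed): "what is the average of price"
+                        # parses to operation='average', column_name='price'; non-numeric entries are
+                        # coerced to NaN by pd.to_numeric(errors='coerce') and dropped before the mean.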
+ else: + response = f"The column '{column_name}' does not contain numeric values that I can analyze." + except Exception as e: + response = f"An error occurred while calculating the {operation} of '{column_name}': {e}" + logger.error(f"Error calculating {operation} for column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + # Enhanced Filter data based on more complex conditions + # Regex to capture: "filter by status active", "show items where category is 'electronics'", "find entries where price > 100" + # It tries to capture: + # 1. column_name (e.g., category, status, price) + # 2. operator (e.g., is, equals, =, >, <, contains, starts with, ends with) - flexible operators + # 3. value (e.g., 'electronics', active, 100) - can be quoted or unquoted + filter_match = re.search( + r'(?:filter|show items|show me items|find entries|select items|get items)\s+' # Optional action phrases + r'(?:where|by|for|with|if)\s+' # Keyword indicating condition + r'(\w+)\s+' # Column name + r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+' # Operator + r'([\'"]?[\w\s.-]+[\'"]?)', # Value (allows spaces, dots, hyphens if quoted, or single words) + lower_message + ) + + if filter_match: + column_name, operator, value_str = filter_match.groups() + column_name = column_name.strip() + operator = operator.strip().lower() + value_str = value_str.strip().strip("'\"") + + logger.info(f"Filter request: Column='{column_name}', Operator='{operator}', Value='{value_str}'") + + if column_name not in df.columns: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + new_filtered_df_state = None # Clear previous filter if column not found + else: + # IMPORTANT: Always filter from the original full dataframe 'df' + active_df_to_filter = df.copy() + try: + # Attempt to infer value type for comparison + target_value: Any + col_dtype = df[column_name].dtype + + # Check if current_filtered_df_state exists and is not empty, use it for filtering + # Otherwise, use the full df + df_to_filter = current_filtered_df_state if current_filtered_df_state is not None and not current_filtered_df_state.empty else df.copy() + + if pd.api.types.is_numeric_dtype(col_dtype) and operator in ['>', '>=', '<', '<=', '==', '!=']: + try: + target_value = float(value_str) + col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce') + except ValueError: + response = f"For numeric column '{column_name}', '{value_str}' is not a valid number." 
+ target_value = None # Error case + elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']: + target_value = value_str.lower() == 'true' + col_series = df_to_filter[column_name].astype(bool, errors='ignore') + else: # Assume string comparison otherwise + target_value = str(value_str) + col_series = df_to_filter[column_name].astype(str).str.lower() # Case-insensitive for strings + value_str_lower = target_value.lower() + + + if 'response' not in locals(): # If no type conversion error occurred + if operator in ['is', 'equals', '==']: + if pd.api.types.is_numeric_dtype(col_dtype) or pd.api.types.is_bool_dtype(col_dtype): + condition = col_series == target_value + else: # String comparison + condition = col_series == value_str_lower + elif operator == '!=': + if pd.api.types.is_numeric_dtype(col_dtype) or pd.api.types.is_bool_dtype(col_dtype): + condition = col_series != target_value + else: # String comparison + condition = col_series != value_str_lower + elif operator == '>' and pd.api.types.is_numeric_dtype(col_dtype): + condition = col_series > target_value + elif operator == '>=' and pd.api.types.is_numeric_dtype(col_dtype): + condition = col_series >= target_value + elif operator == '<' and pd.api.types.is_numeric_dtype(col_dtype): + condition = col_series < target_value + elif operator == '<=' and pd.api.types.is_numeric_dtype(col_dtype): + condition = col_series <= target_value + elif operator in ['contains', 'contain'] and pd.api.types.is_string_dtype(col_series): + condition = col_series.str.contains(value_str_lower, case=False, na=False) + elif operator == 'starts with' and pd.api.types.is_string_dtype(col_series): + condition = col_series.str.startswith(value_str_lower, na=False) + elif operator == 'ends with' and pd.api.types.is_string_dtype(col_series): + condition = col_series.str.endswith(value_str_lower, na=False) + else: + response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})." + condition = None + # If operator was bad, response is set, clear filter state + if response: new_filtered_df_state = None + + + if condition is not None: + # Apply condition to the active_df_to_filter (which is a copy of the full df) + filtered_results_df = df_to_filter[condition] # Use df_to_filter here + if not filtered_results_df.empty: + new_filtered_df_state = filtered_results_df # Update state with new filter result + num_results = len(filtered_results_df) + preview_rows = min(num_results, 5) + preview_cols = min(len(filtered_results_df.columns), 5) + + preview_df = filtered_results_df.head(preview_rows).iloc[:, :preview_cols] + preview_str = preview_df.to_string(index=False) + + response = (f"Found {num_results} items where '{column_name}' {operator} '{value_str}'.\n" + f"Here's a preview:\n```\n{preview_str}\n```\n" + f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.") + else: + new_filtered_df_state = pd.DataFrame() # Store empty DF for "no results" + response = f"No items found where '{column_name}' {operator} '{value_str}'." + # If condition is None (e.g. bad operator) and response not already set by type check, set generic invalid op message. + elif not response: # Avoid overwriting specific error from type check + response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})." 
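+                            # Illustrative end-to-end example (column/value assumed): "find entries where price > 100"
+                            # parses to column_name='price', operator='>', value_str='100'; the matching rows are
+                            # stored in new_filtered_df_state and exposed via the 'Download Filtered JSON' button.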
+ new_filtered_df_state = None + + + except ValueError as ve: # Specifically catch ValueError for target_value conversion + response = f"Invalid value '{value_str}' for numeric column '{column_name}'. {ve}" + new_filtered_df_state = None # Clear on value error + logger.warning(f"ValueError during filter: {ve}") + except Exception as e: + new_filtered_df_state = None # Clear on other errors + response = f"An error occurred while applying the filter: {e}" + logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}") + # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results) + + # --- End of Enhanced Filter Logic --- + + # If `response` is still empty, it means no filter query was matched by the filter_match regex. + # In this case, new_filtered_df_state (initialized from current_filtered_df_state) remains unchanged. + + + # Request structured output (e.g., as CSV or simplified JSON) + # This section should act on the *original* df unless specifically asked for filtered data export. + # The new download buttons handle filtered data export separately. + # Let's assume for now it acts on the original df, and a separate command would be needed for "export filtered data" + # If no filter query matched, and no other specific df query matched, + # then `response` might still be empty. `new_filtered_df_state` will be the same as `current_filtered_df_state`. + # The general queries below should not reset `new_filtered_df_state` unless it's a "clear" command. + + elif "output as csv" in lower_message or "export as csv" in lower_message: + if df is not None and not df.empty: + csv_output = df.to_csv(index=False) + response = f"Here is the data in CSV format:\n```csv\n{csv_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as CSV." + elif "output as json" in lower_message or "export as json" in lower_message: # Note: "export as json" is different from download buttons + if df is not None and not df.empty: + json_output = df.to_json(orient='records', indent=2) + response = f"Here is the data in JSON format:\n```json\n{json_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as JSON." + + # --- General Queries (if no DataFrame or specific query matched AND no filter was applied in this turn) --- + # These should not clear new_filtered_df_state unless it's a "clear" command. + if not response: # Only enter if no response has been generated by DataFrame/filter logic + if "how many items" in lower_message or "number of items" in lower_message: + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The currently filtered dataset has {len(new_filtered_df_state)} items. The original dataset has {len(df if df is not None else chatbot_data)} items." + elif df is not None: # Check df from original chatbot_data + response = f"There are {len(df)} top-level items in the processed data." + elif isinstance(chatbot_data, list): # Fallback if df creation failed but chatbot_data is list + response = f"There are {len(chatbot_data)} top-level items in the processed data (not in DataFrame)." + elif isinstance(chatbot_data, dict): + response = "The processed data is a single dictionary, not a list of items." + else: + response = "The processed data is not a standard list or dictionary structure." 
+ + elif "what is the structure" in lower_message or "tell me about the data" in lower_message: + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The filtered data has columns: {', '.join(new_filtered_df_state.columns)}. " + if df is not None: + response += f"The original data has columns: {', '.join(df.columns)}." + else: + response += "Original data structure is not tabular." + elif df is not None: + response = f"The data is a table with {len(df)} rows and columns: {', '.join(df.columns)}." + elif isinstance(chatbot_data, list) and chatbot_data: + sample_item = chatbot_data[0] + response = f"The data is a list containing {len(chatbot_data)} items. The first item has the following top-level keys: {list(sample_item.keys())}." + elif isinstance(chatbot_data, dict): + response = f"The data is a dictionary with the following top-level keys: {list(chatbot_data.keys())}." + else: + response = "The processed data is not a standard list or dictionary structure that I can easily describe." + + # "show me" without a filter condition might be ambiguous. + # Let's assume it refers to the original data or provide guidance. + elif "show me" in lower_message or "get me" in lower_message or "extract" in lower_message: + # This specific 'show me' without 'where' should not trigger a filter or clear existing filter state. + # It's a general request for data, which is too broad. Guide the user. + response = "If you want to filter the data, please use a phrase like 'show me items where column_name is value'. If you want to see the raw data, consider using the download buttons." + + # --- Speculation about Modifications --- + elif "how can i modify" in lower_message or "how to change" in lower_message or "can i add" in lower_message or "can i remove" in lower_message: + response = "I cannot directly modify the data here, but I can tell you how you *could* modify it. What kind of change are you considering (e.g., adding an item, changing a value, removing a field)?" + elif "add a field" in lower_message or "add a column" in lower_message: + response = "To add a field (or column if the data is tabular), you would typically iterate through each item (or row) in the data and add the new key-value pair. For example, adding a 'status' field with a default value." + elif "change a value" in lower_message or "update a field" in lower_message: + response = "To change a value, you would need to identify the specific item(s) and the field you want to update. You could use a condition (like filtering) to find the right items and then assign a new value to the field." + elif "remove a field" in lower_message or "delete a column" in lower_message: + response = "To remove a field, you would iterate through each item and delete the specified key. Be careful, as this is irreversible." + elif "restructure" in lower_message or "change the format" in lower_message: + response = "Restructuring data involves transforming it into a different shape. This could mean flattening nested objects, grouping items, or pivoting data. This often requires writing custom code to map the old structure to the new one." + elif "what if i" in lower_message or "if i changed" in lower_message: + response = "Tell me what specific change you're contemplating, and I can speculate on the potential impact or how you might approach it programmatically." + + # --- General Conversation / Fallback --- + elif "hello" in lower_message or "hi" in lower_message: + response = random.choice(["Hello! 
How can I help you understand the processed data?", "Hi there! What's on your mind about this data?", "Hey! Ask me anything about the data you've loaded."]) + elif "thank you" in lower_message or "thanks" in lower_message: + response = random.choice(["You're welcome!", "Glad I could help.", "No problem! Let me know if you have more questions about the data."]) + elif "clear chat" in lower_message: # This should be caught by button, but as text too + chat_history = [] # Gradio handles this for the Chatbot component via button + response = "Chat history cleared." + new_filtered_df_state = None # Also clear filtered data on "clear chat" command by text + elif not response: # Fallback if nothing else matched + response = random.choice([ + "I can analyze the data you've processed. What would you like to know? Try asking to filter data, e.g., 'show items where status is active'.", + "Ask me about the number of items, the structure, or values of specific fields. You can also filter data.", + "I can perform basic analysis or filter the data. For example: 'filter by price > 100'.", + "Tell me what you want to extract or filter from the data. Use phrases like 'show items where ...'.", + "I'm equipped to filter your data. Try 'find entries where name contains widget'." + ]) + + except Exception as e: + logger.error(f"Chatbot runtime error: {e}") + response = f"An internal error occurred while processing your request: {e}" + response += "\nPlease try rephrasing your question or clear the chat history." + # On unexpected error, preserve the current_filtered_df_state rather than clearing or modifying it. + # new_filtered_df_state = current_filtered_df_state # This line is effectively already done by initialization + + if not response: # Final safety net for response, if it's somehow still empty + response = "I'm not sure how to respond to that. Please try rephrasing or ask for help on available commands." + + if chat_history and chat_history[-1][1] == "": # If last history entry is (user_msg, "") + chat_history[-1] = (chat_history[-1][0], response) + # else: # This case should ideally not be reached if chat_history.append((message, "")) is always called first. 
+ # chat_history.append((message, response)) # Avoids duplicate user message if something went wrong + + return chat_history, chatbot_data, new_filtered_df_state + +# --- Gradio Interface Definition --- +def create_modern_interface(): + """Create a modern and visually appealing Gradio interface""" + + css = """ + /* Modern color scheme */ + :root { + --primary-color: #1a365d; + --secondary-color: #2d3748; + --accent-color: #4299e1; + --background-color: #f7fafc; + --success-color: #48bb78; + --error-color: #f56565; + --warning-color: #ed8936; + } + /* Container styling */ + .container { + max-width: 1200px; + margin: auto; + padding: 2rem; + background-color: var(--background-color); + border-radius: 1rem; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); + } + /* Component styling */ + .input-container { + background-color: white; + padding: 1.5rem; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + margin-bottom: 1rem; + } + /* Button styling */ + .primary-button { + background-color: var(--primary-color); + color: white; + padding: 0.75rem 1.5rem; + border-radius: 0.375rem; + border: none; + cursor: pointer; + transition: all 0.2s; + } + .primary-button:hover { + background-color: var(--accent-color); + transform: translateY(-1px); + } + /* Status messages */ + .status { + padding: 1rem; + border-radius: 0.375rem; + margin: 1rem 0; + } + .status.success { background-color: #f0fff4; color: var(--success-color); } + .status.error { background-color: #fff5f5; color: var(--error-color); } + .status.warning { background-color: #fffaf0; color: var(--warning-color); } + /* Gallery styling */ + .gallery { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 1rem; + padding: 1rem; + background-color: white; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + } + .gallery img { + width: 100%; + height: auto; + border-radius: 0.375rem; + transition: transform 0.2s; + } + .gallery img:hover { + transform: scale(1.05); + } + /* QR Code Viewport Styling */ + .viewport-container { + display: grid; + gap: 0.5rem; + padding: 1rem; + background-color: white; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + margin-top: 1rem; + } + .viewport-item { + display: flex; + flex-direction: column; + align-items: center; + } + .viewport-item img { + width: 100%; + height: auto; + border-radius: 0.375rem; + transition: transform 0.2s; + max-width: 150px; + max-height: 150px; + } + """ + with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface: + interface.head += """ + + """ + with gr.Row(): + crawl_depth_slider = gr.Slider( + label="Crawl Depth", + minimum=0, + maximum=10, + value=0, + step=1, + interactive=True, + info="Select the maximum depth for crawling links (0-10)." + ) + + qr_code_paths = gr.State([]) + chatbot_data = gr.State(None) + gr.Markdown(""" + # 🌐 Advanced Data Processing & QR Code Generator + Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor. 
+ """) + with gr.Tab("📝 URL Processing"): + url_input = gr.Textbox( + label="Enter URLs (comma or newline separated)", + lines=5, + placeholder="https://example1.com\nhttps://example2.com", + value="" + ) + with gr.Tab("📁 File Input"): + file_input = gr.File( + label="Upload Files", + file_types=None, + file_count="multiple" + ) + with gr.Tab("📋 JSON Input"): + text_input = gr.TextArea( + label="Direct JSON Input", + lines=15, + placeholder="Paste your JSON data here...", + value="" + ) + with gr.Row(): + example_btn = gr.Button("📝 Load Example", variant="secondary") + clear_btn = gr.Button("🗑️ Clear", variant="secondary") + with gr.Row(): + combine_data = gr.Checkbox( + label="Combine all data into sequence", + value=True, + info="Generate sequential QR codes for combined data" + ) + generate_qr_toggle = gr.Checkbox( + label="Generate QR Codes", + value=False, # Default to False as per task + info="Enable to generate QR codes for the processed data." + ) + process_btn = gr.Button( + "🔄 Process & Generate QR", + variant="primary" + ) + output_json = gr.JSON(label="Processed Data") + output_gallery = gr.Gallery( + label="Generated QR Codes", + columns=3, + height=400, + show_label=True + ) + output_text = gr.Textbox( + label="Processing Status", + interactive=False + ) + + with gr.Tab("🖼️ QR Code Viewport") as viewport_tab: + viewport_output = gr.HTML(label="QR Code Sequence Viewport") + enabled_qr_codes = gr.State([]) + + with gr.Tab("🤖 Chat with Data") as chat_tab: + chat_history = gr.State([]) + chatbot = gr.Chatbot(label="Data Chatbot") + filtered_chatbot_df_state = gr.State(None) # To store the filtered DataFrame + + with gr.Row(): + chat_input = gr.Textbox(label="Your Message", placeholder="Ask me about the processed data...") + send_msg_btn = gr.Button("Send") + with gr.Row(): + download_full_json_btn = gr.Button("Download Full JSON") + download_filtered_json_btn = gr.Button("Download Filtered JSON") + download_file_output = gr.File(label="Download Data", interactive=False) # For triggering download + clear_chat_btn = gr.Button("Clear Chat History") + + + def load_example(): + example = { + "type": "product_catalog", + "items": [ + { + "id": "123", + "name": "Premium Widget", + "description": "High-quality widget with advanced features", + "price": 299.99, + "category": "electronics", + "tags": ["premium", "featured", "new"] + }, + { + "id": "456", + "name": "Basic Widget", + "description": "Reliable widget for everyday use", + "price": 149.99, + "category": "electronics", + "tags": ["basic", "popular"] + } + ], + "metadata": { + "timestamp": datetime.now().isoformat(), + "version": "2.0", + "source": "example" + } + } + return json.dumps(example, indent=2) + + def clear_input(): + return "", None, "", None + + def update_viewport(paths, enabled_states): + if not paths: + return "

<p>No QR codes generated yet.</p>

" + + num_qr_codes = len(paths) + cols = math.ceil(math.sqrt(num_qr_codes)) + cols = max(1, min(cols, 6)) + + viewport_html = f'
' + + # If states are not yet initialized or out of sync, enable all by default + if enabled_states is None or len(enabled_states) != num_qr_codes or not enabled_states: + enabled_states = list(range(num_qr_codes)) # Initialize with all enabled + + for i, path in enumerate(paths): + is_enabled = i in enabled_states + border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" + opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" + viewport_html += f'
' + viewport_html += f'<p>QR Code {i+1}</p>' + viewport_html += f'<img src="/file={path}" alt="QR code {i+1}">' + viewport_html += '</div>
' + viewport_html += '</div>
' + + return viewport_html + + def process_inputs(urls, files, text, combine, crawl_depth, generate_qr_enabled): + """Process all inputs and generate QR codes based on toggle""" + results = [] + processing_status_messages = [] + + url_processor = EnhancedURLProcessor() + file_processor = EnhancedFileProcessor() + + try: + if text and text.strip(): + try: + json_data = json.loads(text) + results.append({ + 'source': 'json_input', + 'extracted_data': json_data, + 'timestamp': datetime.now().isoformat(), + 'processing_notes': ['Parsed from direct JSON input.'] + }) + processing_status_messages.append("✅ Successfully parsed direct JSON input.") + except json.JSONDecodeError as e: + processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}") + except Exception as e: + processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}") + + if urls and urls.strip(): + url_list = re.split(r'[,\n]', urls) + url_list = [url.strip() for url in url_list if url.strip()] + for url in url_list: + processing_status_messages.append(f"🌐 Processing URL: {url} with crawl depth {crawl_depth}...") + content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth) + if content_result: # Check if a result dictionary was returned + results.append(content_result) + if content_result.get('fetch_result') is not None: + processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)") + if content_result.get('processing_notes'): + processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}") + + if content_result.get('linked_extractions'): + num_linked_processed = len([r for r in content_result['linked_extractions'] if r and r.get('fetch_result') is not None]) + processing_status_messages.append(f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.") + else: + processing_status_messages.append(f"❌ Failed to process URL: {url}") + if content_result.get('processing_notes'): + processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}") + else: + processing_status_messages.append(f"❌ Failed to process URL: {url} (No result returned)") + + + if files: + for file in files: + processing_status_messages.append(f"📁 Processing file: {file.name}...") + file_results = file_processor.process_file(file) + if file_results: + results.extend(file_results) + processing_status_messages.append(f"✅ Processed file: {file.name}") + for res in file_results: + if res.get('processing_notes'): + processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}") + else: + processing_status_messages.append(f"❌ Failed to process file: {file.name}") + + qr_paths = [] + final_json_output = None + qr_paths = [] + + if results: + final_json_output = results # Assign processed data regardless of QR generation + if generate_qr_enabled: + processing_status_messages.append("⚙️ Generating QR codes as requested...") + qr_paths = generate_qr_codes(results, combine) + if qr_paths: + processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.") + else: + processing_status_messages.append("❌ Failed to generate QR codes (empty result or error). Check logs for details.)") + else: + processing_status_messages.append("☑️ QR code generation was disabled. 
Processed data is available.") + qr_paths = [] # Ensure it's empty + else: + processing_status_messages.append("⚠️ No valid content collected from inputs.") + final_json_output = {} # Ensure output_json is cleared if no results + + except Exception as e: + logger.error(f"Overall processing error in process_inputs: {e}") + processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}") + + return ( + final_json_output, + [str(path) for path in qr_paths], + "\n".join(processing_status_messages), + final_json_output + ) + + def on_qr_generation(qr_paths_list): + if qr_paths_list is None: + num_qrs = 0 + else: + num_qrs = len(qr_paths_list) + + initial_enabled_states = list(range(num_qrs)) + return qr_paths_list, initial_enabled_states + + example_btn.click(load_example, inputs=[], outputs=text_input) + clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data]) + + process_btn.click( + process_inputs, + inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider, generate_qr_toggle], + outputs=[output_json, output_gallery, output_text, chatbot_data] + ).then( + on_qr_generation, + inputs=[output_gallery], # Pass the list of paths from output_gallery + outputs=[qr_code_paths, enabled_qr_codes] # Update qr_code_paths state and initial enabled_qr_codes state + ) + + viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output]) + + send_msg_btn.click( + respond_to_chat, + inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], + outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] + ).then( + lambda: "", + inputs=None, + outputs=chat_input + ) + + chat_input.submit( + respond_to_chat, + inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], + outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] + ).then( + lambda: "", + inputs=None, + outputs=chat_input + ) + + clear_chat_btn.click( + lambda: ([], None), # Clear chat history and filtered data state + inputs=None, + outputs=[chatbot, filtered_chatbot_df_state] + ) + + # --- Download Logic --- + def download_json_data(data_df: Optional[pd.DataFrame], filename_prefix: str) -> Optional[str]: + if data_df is None or data_df.empty: + logger.info(f"No data provided for download with prefix '{filename_prefix}'.") + return None + try: + data_list = data_df.to_dict(orient='records') + json_str = json.dumps(data_list, indent=2, ensure_ascii=False) + + timestamp = int(time.time()) + filename = f"{filename_prefix}_{timestamp}.json" + file_path = TEMP_DIR / filename + + with open(file_path, 'w', encoding='utf-8') as f: + f.write(json_str) + + logger.info(f"Successfully created JSON file for download: {file_path}") + return str(file_path) + except Exception as e: + logger.error(f"Error creating JSON file for {filename_prefix}: {e}") + return None + + def handle_download_full_json(current_chatbot_data_state: Optional[List[Dict]]) -> Optional[str]: + if not current_chatbot_data_state: + logger.info("No full data available to download.") + # Optionally, could return a gr.Warning or gr.Info to the UI if we had a dedicated status Textbox for downloads + return None + + # The chatbot_data state is a list of dicts. Convert to DataFrame for download_json_data. + # The df created in respond_to_chat is not directly used here to ensure we get the *original* full data. 
+ try: + # A similar flattening logic as in respond_to_chat might be needed if chatbot_data_state is complex + # For now, assume it's a list of flat dictionaries or can be handled by pd.DataFrame directly. + # If chatbot_data_state originates from `results` in `process_inputs`, it's a list of dicts. + # A more robust approach would be to re-use the exact flattening from respond_to_chat if structures are nested. + # Let's try a direct conversion first. + df_to_download = pd.DataFrame(current_chatbot_data_state) + if df_to_download.empty: + logger.info("Full data resulted in an empty DataFrame. Nothing to download.") + return None + except Exception as e: + logger.error(f"Error converting full chatbot_data to DataFrame for download: {e}") + return None + + return download_json_data(df_to_download, "full_data") + + def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]: + if current_filtered_df_state is None or current_filtered_df_state.empty: + logger.info("No filtered data available to download.") + # Consider gr.Info("No filtered data to download.") if a text output for this is desired. + return None + return download_json_data(current_filtered_df_state, "filtered_data") + + download_full_json_btn.click( + fn=handle_download_full_json, + inputs=[chatbot_data], # chatbot_data is the gr.State holding the full dataset (List[Dict]) + outputs=[download_file_output] + ) + download_filtered_json_btn.click( + fn=handle_download_filtered_json, + inputs=[filtered_chatbot_df_state], # This state holds the filtered DataFrame + outputs=[download_file_output] + ) + + gr.Markdown(""" + ### 🚀 Features + - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth. **(Now performs real fetching)** + - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*). **(Now performs real file processing)** + - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. + - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)** + - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification. + - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item. + - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data, **including positional sequencing tags `{startN}` and `{endN}` in the QR code content**. + - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing. + - **Modern Design**: Clean, responsive interface with visual feedback. + - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information. + ### 💡 Tips + 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**. + 2. **Files**: Upload any type of file. 
The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. + 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure. + 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. + 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. **Remember to check the "Generate QR Codes" checkbox!** + 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps. + 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images. + 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output. + ### ⚙️ QR Code Viewport Instructions + 1. Navigate to the **QR Code Viewport** tab after generating QR codes. + 2. The generated QR codes will be displayed in a grid based on their total count. + 3. Use the checkboxes below each QR code to enable or disable it for visual selection. Enabled codes have a green border and full opacity. + 4. This viewport is currently for visualization and selection *within the UI*; it doesn't change the generated files themselves. You would manually select which physical QR codes to scan based on this view. + """) + return interface + +def main(): + """Initialize and launch the application""" + try: + mimetypes.init() + interface = create_modern_interface() + interface.launch( + share=False, + debug=False, + show_error=True, + show_api=False + ) + except Exception as e: + logger.error(f"Application startup error: {e}") + print(f"\nFatal Error: {e}\nCheck the logs for details.") + raise + +if __name__ == "__main__": + main() +import json +import os +import re +import time +import logging +import mimetypes +import zipfile +import tempfile +import chardet +import io +import csv +import xml.etree.ElementTree as ET +from datetime import datetime +from typing import List, Dict, Optional, Union, Tuple, Any +from pathlib import Path +from urllib.parse import urlparse, urljoin +import requests +import validators +import gradio as gr +from diskcache import Cache +from bs4 import BeautifulSoup +from fake_useragent import UserAgent +from cleantext import clean +import qrcode +from PIL import Image, ImageDraw, ImageFont +import numpy as np +import tarfile +import gzip +import math +import random +import pandas as pd +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Setup enhanced logging with more detailed formatting +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('app.log', encoding='utf-8') + ]) +logger = logging.getLogger(__name__) + +# Conditional imports for document processing +try: + from PyPDF2 import PdfReader + PDF_SUPPORT = True +except ImportError: + PDF_SUPPORT = False + logger.warning("PyPDF2 not installed. 
PDF file processing will be limited.") + +try: + from docx import Document + DOCX_SUPPORT = True +except ImportError: + DOCX_SUPPORT = False + logger.warning("python-docx not installed. DOCX file processing will be limited.") + +try: + from pyth.plugins.plaintext.writer import PlaintextWriter + from pyth.plugins.rtf15.reader import Rtf15Reader + RTF_SUPPORT = True +except ImportError: + RTF_SUPPORT = False + logger.warning("pyth not installed. RTF file processing will be limited.") + +try: + from odf.opendocument import OpenDocumentText + from odf import text as odftext + ODT_SUPPORT = True +except ImportError: + ODT_SUPPORT = False + logger.warning("odfpy not installed. ODT file processing will be limited.") + +# Ensure output directories exist with modern structure +OUTPUTS_DIR = Path('output') +QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes' +TEMP_DIR = OUTPUTS_DIR / 'temp' +for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]: + directory.mkdir(parents=True, exist_ok=True) + +class EnhancedURLProcessor: + """Advanced URL processing with enhanced content extraction and recursive link following.""" + + def __init__(self): + # Use a real requests session with retry strategy + self.session = requests.Session() + retry_strategy = Retry( + total=3, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET"] + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + self.user_agent = UserAgent() + self.timeout = 15 # seconds + + def validate_url(self, url: str) -> Dict[str, Any]: + """Enhanced URL validation with accessibility check.""" + if not validators.url(url): + return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'} + + parsed = urlparse(url) + if not all([parsed.scheme, parsed.netloc]): + return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'} + + try: + # Use a HEAD request to check accessibility without downloading full content + headers = {'User-Agent': self.user_agent.random} + response = self.session.head(url, timeout=self.timeout, headers=headers, allow_redirects=True) + response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) + + # Check content type if available in HEAD response + content_type = response.headers.get('Content-Type', '').split(';')[0].strip() + if not content_type or not (content_type.startswith('text/') or 'json' in content_type or 'xml' in content_type): + # Basic check if content type seems relevant for text extraction + logger.warning(f"URL {url} returned potentially irrelevant content type: {content_type}") + # Decide if this should invalidate the URL or just add a note + # For now, we'll allow fetching but add a note. 
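+            # Illustrative successful result (header values assumed):
+            #   {'is_valid': True, 'message': 'URL is valid and accessible',
+            #    'details': {'final_url': 'https://example.com/', 'content_type': 'text/html',
+            #                'server': 'nginx', 'size': '1256'}}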
+ + return { + 'is_valid': True, + 'message': 'URL is valid and accessible', + 'details': { + 'final_url': response.url, # Capture final URL after redirects + 'content_type': content_type, + 'server': response.headers.get('Server', 'N/A'), + 'size': response.headers.get('Content-Length', 'N/A') + } + } + except requests.exceptions.RequestException as e: + return {'is_valid': False, 'message': 'URL not accessible', 'details': str(e)} + except Exception as e: + logger.error(f"Unexpected error during URL validation for {url}: {e}") + return {'is_valid': False, 'message': 'Unexpected validation error', 'details': str(e)} + + + def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]: + """Enhanced content fetcher with retry mechanism and complete character extraction.""" + try: + logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1})") + headers = {'User-Agent': self.user_agent.random} + response = self.session.get(url, timeout=self.timeout, headers=headers, allow_redirects=True) + response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) + + final_url = response.url # Capture potential redirects + content_type = response.headers.get('Content-Type', '').split(';')[0].strip() + + # Attempt to detect encoding if not specified in headers + encoding = response.encoding # requests attempts to guess encoding + if encoding is None or encoding == 'ISO-8859-1': # Fallback if requests guess is default/uncertain + try: + encoding_detection = chardet.detect(response.content) + encoding = encoding_detection['encoding'] or 'utf-8' + logger.debug(f"Chardet detected encoding: {encoding} for {url}") + except Exception as e: + logger.warning(f"Chardet detection failed for {url}: {e}. Falling back to utf-8.") + encoding = 'utf-8' + + + raw_content = response.content.decode(encoding, errors='replace') + + # Extract metadata + metadata = { + 'original_url': url, + 'final_url': final_url, + 'timestamp': datetime.now().isoformat(), + 'detected_encoding': encoding, + 'content_type': content_type, + 'content_length': len(response.content), + 'headers': dict(response.headers), + 'status_code': response.status_code + } + + # Process based on content type + processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url) + + return { + 'source': 'url', + 'url': url, # Keep original URL as identifier for this step + 'raw_content': raw_content, + 'metadata': metadata, + 'extracted_data': processed_extraction['data'], + 'processing_notes': processed_extraction['notes'] + } + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch content from {url}: {e}") + return { + 'source': 'url', + 'url': url, + 'raw_content': None, + 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': getattr(e.response, 'status_code', None)}, + 'extracted_data': None, + 'processing_notes': [f"Failed to fetch content: {str(e)}"] + } + except Exception as e: + logger.error(f"Unexpected error while fetching or processing URL {url}: {e}") + return { + 'source': 'url', + 'url': url, + 'raw_content': raw_content if 'raw_content' in locals() else None, + 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, + 'extracted_data': None, + 'processing_notes': [f"Unexpected processing error: {str(e)}"] + } + + + def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]: + 
"""Process content based on detected content type""" + lower_content_type = content_type.lower() + notes = [] + extracted_data: Any = None + try: + if 'text/html' in lower_content_type: + logger.debug(f"Processing HTML content from {base_url}") + extracted_data = self._process_html_content_enhanced(content, base_url) + notes.append("Processed as HTML") + elif 'application/json' in lower_content_type or 'text/json' in lower_content_type: + logger.debug(f"Processing JSON content from {base_url}") + try: + extracted_data = json.loads(content) + notes.append("Parsed as JSON") + except json.JSONDecodeError as e: + extracted_data = content + notes.append(f"Failed to parse as JSON: {e}") + logger.warning(f"Failed to parse JSON from {base_url}: {e}") + except Exception as e: + extracted_data = content + notes.append(f"Error processing JSON: {e}") + logger.error(f"Error processing JSON from {base_url}: {e}") + elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'): + logger.debug(f"Processing XML content from {base_url}") + try: + root = ET.fromstring(content) + xml_text = ET.tostring(root, encoding='unicode', method='xml') + extracted_data = xml_text + notes.append("Parsed as XML (text representation)") + except ET.ParseError as e: + extracted_data = content + notes.append(f"Failed to parse as XML: {e}") + logger.warning(f"Failed to parse XML from {base_url}: {e}") + except Exception as e: + extracted_data = content + notes.append(f"Error processing XML: {e}") + logger.error(f"Error processing XML from {base_url}: {e}") + elif 'text/plain' in lower_content_type or 'text/' in lower_content_type: + logger.debug(f"Processing Plain Text content from {base_url}") + extracted_data = content + notes.append("Processed as Plain Text") + else: + logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.") + extracted_data = content + notes.append(f"Unknown content type '{content_type}'. Stored raw text.") + except Exception as e: + logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}") + extracted_data = content + notes.append(f"Unexpected processing error: {e}. 
Stored raw text.") + return {'data': extracted_data, 'notes': notes} + + def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]: + """Process HTML content, preserving text, and extracting metadata and links.""" + extracted: Dict[str, Any] = { + 'title': None, + 'meta_description': None, + 'full_text': "", + 'links': [] + } + try: + soup = BeautifulSoup(content, 'html.parser') + + if soup.title and soup.title.string: + extracted['title'] = soup.title.string.strip() + + meta_desc = soup.find('meta', attrs={'name': 'description'}) + if meta_desc and meta_desc.get('content'): + extracted['meta_description'] = meta_desc['content'].strip() + + unique_links = set() + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'].strip() + if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')): + text = a_tag.get_text().strip() + try: + absolute_url = urljoin(base_url, href) + if absolute_url not in unique_links: + extracted['links'].append({'text': text, 'url': absolute_url}) + unique_links.add(absolute_url) + except Exception: + if validators.url(href) and href not in unique_links: + extracted['links'].append({'text': text, 'url': href}) + unique_links.add(href) + elif urlparse(href).netloc and href not in unique_links: + extracted['links'].append({'text': text, 'url': href}) + unique_links.add(href) + + soup_copy = BeautifulSoup(content, 'html.parser') + for script_or_style in soup_copy(["script", "style"]): + script_or_style.extract() + text = soup_copy.get_text(separator='\n') + lines = text.splitlines() + cleaned_lines = [line.strip() for line in lines if line.strip()] + extracted['full_text'] = '\n'.join(cleaned_lines) + + except Exception as e: + logger.error(f"Enhanced HTML processing error for {base_url}: {e}") + soup_copy = BeautifulSoup(content, 'html.parser') + for script_or_style in soup_copy(["script", "style"]): + script_or_style.extract() + extracted['full_text'] = soup_copy.get_text(separator='\n').strip() + extracted['processing_error'] = f"Enhanced HTML processing failed: {e}" + + return extracted + + def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]: + """Fetches content from a URL and recursively follows links up to max_steps depth.""" + if not isinstance(max_steps, int) or not (0 <= max_steps <= 10): + logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10.") + return { + 'url': url, + 'level': 0, + 'fetch_result': None, + 'linked_extractions': [], + 'processing_notes': [f"Invalid max_steps value: {max_steps}. 
Must be an integer between 0 and 10."] + } + + validation_result = self.validate_url(url) + if not validation_result['is_valid']: + logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}") + return { + 'url': url, + 'level': 0, + 'fetch_result': None, + 'linked_extractions': [], + 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"] + } + + # Use a set to keep track of visited URLs during the crawl to avoid infinite loops + visited_urls = set() + return self._fetch_content_recursive(url, max_steps, current_step=0, visited_urls=visited_urls) + + def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int, visited_urls: set) -> Dict[str, Any]: + """Recursive helper function to fetch content and follow links.""" + if current_step > max_steps: + logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.") + return { + 'url': url, + 'level': current_step, + 'fetch_result': None, + 'linked_extractions': [], + 'processing_notes': [f"Depth limit ({max_steps}) reached."] + } + + # Normalize URL before checking visited set + normalized_url = url.rstrip('/') # Simple normalization + + if normalized_url in visited_urls: + logger.debug(f"Skipping already visited URL: {url} at level {current_step}.") + return { + 'url': url, + 'level': current_step, + 'fetch_result': None, # Indicate not fetched in this run + 'linked_extractions': [], + 'processing_notes': ["URL already visited in this crawl."] + } + + visited_urls.add(normalized_url) # Mark as visited + + logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}") + fetch_result = self.fetch_content(url) + linked_extractions: List[Dict[str, Any]] = [] + + if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower(): + extracted_data = fetch_result['extracted_data'] + links = extracted_data.get('links', []) + + logger.info(f"Found {len(links)} potential links on {url} at level {current_step}. 
Proceeding to depth {current_step + 1}.") + if current_step < max_steps: + for link_info in links: + linked_url = link_info.get('url') + if linked_url: + # Ensure linked URL is absolute and potentially within the same domain + # Simple same-domain check (can be made more sophisticated) + try: + base_domain = urlparse(url).netloc + linked_domain = urlparse(linked_url).netloc + if linked_domain and linked_domain != base_domain: + logger.debug(f"Skipping external link: {linked_url}") + continue # Skip external links + + # Recursively call for linked URLs + linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1, visited_urls) + if linked_result: + linked_extractions.append(linked_result) + except Exception as e: + logger.warning(f"Error processing linked URL {linked_url} from {url}: {e}") + + + current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.'] + if f"Processed at level {current_step}" not in current_notes: + current_notes.append(f"Processed at level {current_step}") + + return { + 'url': url, + 'level': current_step, + 'fetch_result': fetch_result, + 'linked_extractions': linked_extractions, + 'processing_notes': current_notes + } + + +class EnhancedFileProcessor: + """Advanced file processing with enhanced content extraction""" + def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default + self.max_file_size = max_file_size + self.supported_extensions = { + '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', + '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', + '.pdf', '.doc', '.docx', '.rtf', '.odt', + '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', + } + self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'} + + def process_file(self, file) -> List[Dict]: + """Process uploaded file with enhanced error handling and complete extraction""" + if not file or not hasattr(file, 'name'): + logger.warning("Received invalid file object.") + return [] + + dataset = [] + file_path = Path(file.name) + + if not file_path.exists(): + logger.error(f"File path does not exist: {file_path}") + return [{ + 'source': 'file', + 'filename': file.name if hasattr(file, 'name') else 'unknown', + 'file_size': None, + 'extracted_data': None, + 'processing_notes': ['File path does not exist.'] + }] + + try: + file_size = file_path.stat().st_size + if file_size > self.max_file_size: + logger.warning(f"File '{file_path.name}' size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes).") + return [{ + 'source': 'file', + 'filename': file_path.name, + 'file_size': file_size, + 'extracted_data': None, + 'processing_notes': ['File size exceeds limit.'] + }] + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + + if file_path.suffix.lower() in self.archive_extensions: + dataset.extend(self._process_archive(file_path, temp_dir_path)) + elif file_path.suffix.lower() in self.supported_extensions: + dataset.extend(self._process_single_file(file_path)) + else: + logger.warning(f"Unsupported file type for processing: '{file_path.name}'. 
Attempting to read as plain text.") + try: + content_bytes = file_path.read_bytes() + encoding_detection = chardet.detect(content_bytes) + encoding = encoding_detection['encoding'] or 'utf-8' + raw_content = content_bytes.decode(encoding, errors='replace') + dataset.append({ + 'source': 'file', + 'filename': file_path.name, + 'file_size': file_size, + 'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown', + 'extracted_data': {'plain_text': raw_content}, + 'processing_notes': ['Processed as plain text (unsupported extension).'] + }) + except Exception as e: + logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}") + dataset.append({ + 'source': 'file', + 'filename': file_path.name, + 'file_size': file_size, + 'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown', + 'extracted_data': None, + 'processing_notes': [f'Unsupported file type and failed to read as text: {e}'] + }) + + except Exception as e: + logger.error(f"Error processing file '{file_path.name}': {str(e)}") + dataset.append({ + 'source': 'file', + 'filename': file_path.name, + 'file_size': file_size if 'file_size' in locals() else None, + 'extracted_data': None, + 'processing_notes': [f'Overall file processing error: {str(e)}'] + }) + return dataset + + def _is_archive(self, filepath: Union[str, Path]) -> bool: + """Check if file is an archive""" + p = Path(filepath) if isinstance(filepath, str) else filepath + return p.suffix.lower() in self.archive_extensions + + def _process_single_file(self, file_path: Path) -> List[Dict]: + """Process a single file with enhanced character extraction and format-specific handling""" + dataset_entries = [] + filename = file_path.name + file_size = file_path.stat().st_size + mime_type, _ = mimetypes.guess_type(file_path) + mime_type = mime_type or 'unknown/unknown' + file_extension = file_path.suffix.lower() + + logger.info(f"Processing single file: '{filename}' ({mime_type}, {file_size} bytes)") + + raw_content: Optional[str] = None + extracted_data: Any = None + processing_notes: List[str] = [] + + try: + content_bytes = file_path.read_bytes() + encoding_detection = chardet.detect(content_bytes) + encoding = encoding_detection['encoding'] or 'utf-8' + raw_content = content_bytes.decode(encoding, errors='replace') + + is_explicit_json = mime_type == 'application/json' or file_extension == '.json' + looks_like_json = raw_content.strip().startswith('{') or raw_content.strip().startswith('[') + + if is_explicit_json or looks_like_json: + try: + extracted_data = json.loads(raw_content) + processing_notes.append("Parsed as JSON.") + if not is_explicit_json: + processing_notes.append("Note: Content looked like JSON despite extension/mime.") + logger.warning(f"File '{filename}' identified as JSON content despite extension/mime.") + mime_type = 'application/json' + except json.JSONDecodeError as e: + processing_notes.append(f"Failed to parse as JSON: {e}.") + if is_explicit_json: + logger.error(f"Explicit JSON file '{filename}' has invalid format: {e}") + else: + logger.warning(f"Content of '{filename}' looks like JSON but failed to parse: {e}") + except Exception as e: + processing_notes.append(f"Error processing JSON: {e}.") + logger.error(f"Error processing JSON in '{filename}': {e}") + + looks_like_xml = extracted_data is None and raw_content.strip().startswith('<') and raw_content.strip().endswith('>') + is_explicit_xml = extracted_data is None and (mime_type in ('application/xml', 'text/xml') or 
mime_type.endswith('+xml') or file_extension in ('.xml', '.xsd'))
+
+            if extracted_data is None and (is_explicit_xml or looks_like_xml):
+                try:
+                    root = ET.fromstring(raw_content)
+                    extracted_data = ET.tostring(root, encoding='unicode', method='xml')
+                    processing_notes.append("Parsed as XML (text representation).")
+                    if not is_explicit_xml:
+                        processing_notes.append("Note: Content looked like XML despite extension/mime.")
+                    if 'xml' not in mime_type: mime_type = 'application/xml'
+                except ET.ParseError as e:
+                    processing_notes.append(f"Failed to parse as XML: {e}.")
+                    if is_explicit_xml:
+                        logger.error(f"Explicit XML file '{filename}' has invalid format: {e}")
+                    else:
+                        logger.warning(f"Content of '{filename}' looks like XML but failed to parse: {e}")
+                except Exception as e:
+                    processing_notes.append(f"Error processing XML: {e}.")
+                    logger.error(f"Error processing XML in '{filename}': {e}")
+
+            is_explicit_csv = extracted_data is None and (mime_type == 'text/csv' or file_extension == '.csv')
+            looks_like_csv = extracted_data is None and (',' in raw_content or ';' in raw_content) and ('\n' in raw_content or len(raw_content.splitlines()) > 1)
+
+            if extracted_data is None and (is_explicit_csv or looks_like_csv):
+                try:
+                    # csv.Sniffer().sniff() returns a Dialect subclass (it has no public .name attribute);
+                    # pass the dialect object itself to csv.reader, or fall back to the 'excel' dialect.
+                    dialect = 'excel'
+                    try:
+                        sample = '\n'.join(raw_content.splitlines()[:10])
+                        if sample:
+                            dialect = csv.Sniffer().sniff(sample)
+                            logger.debug(f"Sniffer detected CSV delimiter {dialect.delimiter!r} for '{filename}'")
+                    except csv.Error:
+                        logger.debug(f"Sniffer failed to detect dialect for '{filename}', using 'excel'.")
+                        dialect = 'excel'
+
+                    csv_reader = csv.reader(io.StringIO(raw_content), dialect=dialect)
+                    rows = list(csv_reader)
+
+                    if rows:
+                        max_rows_preview = 100
+                        extracted_data = {
+                            'headers': rows[0] if rows and rows[0] else None,
+                            'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else []
+                        }
+                        if len(rows) > max_rows_preview + 1:
+                            processing_notes.append(f"CSV data rows truncated to {max_rows_preview}.")
+                        processing_notes.append("Parsed as CSV.")
+                        if not is_explicit_csv:
+                            processing_notes.append("Note: Content looked like CSV despite extension/mime.")
+                        mime_type = 'text/csv'
+                    else:
+                        extracted_data = "Empty CSV"
+                        processing_notes.append("Parsed as empty CSV.")
+                        if not is_explicit_csv:
+                            processing_notes.append("Note: Content looked like CSV but was empty.")
+
+                except Exception as e:
+                    processing_notes.append(f"Failed to parse as CSV: {e}.")
+                    logger.warning(f"Failed to parse CSV from '{filename}': {e}")
+
+            if extracted_data is None:
+                try:
+                    extracted_text = None
+                    if file_extension == '.pdf' and PDF_SUPPORT:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                            tmp_file.write(content_bytes)
+                            temp_path = Path(tmp_file.name)
+                        try:
+                            reader = PdfReader(temp_path)
+                            text_content = "".join(page.extract_text() or "" for page in reader.pages)
+                            extracted_text = text_content
+                            processing_notes.append("Extracted text from PDF.")
+                        finally:
+                            if temp_path.exists(): temp_path.unlink()
+                    elif file_extension == '.docx' and DOCX_SUPPORT:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
+                            tmp_file.write(content_bytes)
+                            temp_path = Path(tmp_file.name)
+                        try:
+                            document = Document(temp_path)
+                            text_content = "\n".join(paragraph.text for paragraph in document.paragraphs)
+                            extracted_text = text_content
+                            processing_notes.append("Extracted text from DOCX.")
+                        finally:
+                            if temp_path.exists(): temp_path.unlink()
+                    elif file_extension == '.rtf' and RTF_SUPPORT:
+                        try:
+                            doc = 
Rtf15Reader.read(io.StringIO(raw_content)) + text_content = PlaintextWriter.write(doc).getvalue() + extracted_text = text_content + processing_notes.append("Extracted text from RTF.") + except Exception as e: + processing_notes.append(f"RTF extraction error: {e}") + logger.warning(f"Failed to extract RTF text from '{filename}': {e}") + elif file_extension == '.odt' and ODT_SUPPORT: + with tempfile.NamedTemporaryFile(delete=False, suffix='.odt') as tmp_file: + tmp_file.write(content_bytes) + temp_path = Path(tmp_file.name) + try: + text_doc = OpenDocumentText(temp_path) + paragraphs = text_doc.getElementsByType(odftext.P) + text_content = "\n".join("".join(node.text for node in p.childNodes) for p in paragraphs) + extracted_text = text_content + processing_notes.append("Extracted text from ODT.") + finally: + if temp_path.exists(): temp_path.unlink() + elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']: + processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.") + logger.warning(f"Automatic text extraction for {file_extension.upper()} not fully implemented for '{filename}'.") + + if extracted_text is not None: + max_extracted_text_size = 10000 + extracted_data = {'text': extracted_text[:max_extracted_text_size]} + if len(extracted_text) > max_extracted_text_size: + extracted_data['text'] += "..." + processing_notes.append("Extracted text truncated.") + + except ImportError as e: + processing_notes.append(f"Missing dependency for document type ({e}). Cannot extract text.") + except Exception as e: + processing_notes.append(f"Error during document text extraction: {e}") + logger.warning(f"Error during document text extraction for '{filename}': {e}") + + if extracted_data is None: + extracted_data = {'plain_text': raw_content} + processing_notes.append("Stored as plain text.") + if mime_type in ['unknown/unknown', 'application/octet-stream']: + guessed_text_mime, _ = mimetypes.guess_type('dummy.txt') + if guessed_text_mime: mime_type = guessed_text_mime + + except Exception as e: + logger.error(f"Fatal error processing single file '{filename}': {e}") + processing_notes.append(f"Fatal processing error: {e}") + raw_content = None + extracted_data = None + + entry = { + 'source': 'file', + 'filename': filename, + 'file_size': file_size, + 'mime_type': mime_type, + 'created': datetime.fromtimestamp(file_path.stat().st_ctime).isoformat() if file_path.exists() else None, + 'modified': datetime.fromtimestamp(file_path.stat().st_mtime).isoformat() if file_path.exists() else None, + 'raw_content': raw_content, + 'extracted_data': extracted_data, + 'processing_notes': processing_notes + } + + dataset_entries.append(entry) + return dataset_entries + + def _process_archive(self, archive_path: Path, extract_to: Path) -> List[Dict]: + """Process an archive file with enhanced extraction""" + dataset = [] + archive_extension = archive_path.suffix.lower() + logger.info(f"Processing archive: '{archive_path.name}'") + + try: + if archive_extension == '.zip': + if zipfile.is_zipfile(archive_path): + with zipfile.ZipFile(archive_path, 'r') as zip_ref: + for file_info in zip_ref.infolist(): + if file_info.file_size > 0 and not file_info.filename.endswith('/'): + sanitized_filename = Path(file_info.filename).name + extracted_file_path = extract_to / sanitized_filename + try: + with zip_ref.open(file_info) as zf, open(extracted_file_path, 'wb') as outfile: + outfile.write(zf.read()) + + if extracted_file_path.suffix.lower() in self.supported_extensions 
and not self._is_archive(extracted_file_path):
+                                        dataset.extend(self._process_single_file(extracted_file_path))
+                                    elif extracted_file_path.suffix.lower() in self.archive_extensions:
+                                        logger.info(f"Found nested archive '{file_info.filename}', processing recursively.")
+                                        dataset.extend(self._process_archive(extracted_file_path, extract_to))
+                                    else:
+                                        logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
+                                except Exception as e:
+                                    logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
+                                finally:
+                                    if extracted_file_path.exists():
+                                        try:
+                                            extracted_file_path.unlink()
+                                        except OSError as e:
+                                            logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
+                else:
+                    logger.error(f"'{archive_path.name}' is not a valid zip file.")
+
+            # Plain .gz files are handled by the dedicated gzip branch below; tarfile itself
+            # handles compressed tarballs transparently via mode 'r:*'.
+            elif archive_extension in ('.tar', '.tgz'):
+                try:
+                    with tarfile.open(archive_path, 'r:*') as tar_ref:
+                        for member in tar_ref.getmembers():
+                            if member.isfile():
+                                sanitized_filename = Path(member.name).name
+                                extracted_file_path = extract_to / sanitized_filename
+                                try:
+                                    if not str(extracted_file_path).startswith(str(extract_to)):
+                                        logger.warning(f"Skipping potentially malicious path in tar: {member.name}")
+                                        continue
+
+                                    # extractfile() can return None (e.g. for special members); check before use.
+                                    member_file = tar_ref.extractfile(member)
+                                    if member_file is None:
+                                        logger.warning(f"Could not extract file-like object for {member.name} from tar.")
+                                        continue
+                                    with member_file as tf, open(extracted_file_path, 'wb') as outfile:
+                                        outfile.write(tf.read())
+
+                                    if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
+                                        dataset.extend(self._process_single_file(extracted_file_path))
+                                    elif extracted_file_path.suffix.lower() in self.archive_extensions:
+                                        logger.info(f"Found nested archive '{member.name}', processing recursively.")
+                                        dataset.extend(self._process_archive(extracted_file_path, extract_to))
+                                    else:
+                                        logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
+                                except Exception as e:
+                                    logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
+                                finally:
+                                    if extracted_file_path.exists():
+                                        try:
+                                            extracted_file_path.unlink()
+                                        except OSError as e:
+                                            logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
+
+                except tarfile.TarError as e:
+                    logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
+
+            elif archive_extension == '.gz':
+                # A bare .gz wraps a single file; decompress it and process the result
+                # (a decompressed .tar lands back in the archive branch recursively).
+                extracted_name = archive_path.stem
+                extracted_path = extract_to / extracted_name
+                try:
+                    with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
+                        outfile.write(gz_file.read())
+                    if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_path):
+                        dataset.extend(self._process_single_file(extracted_path))
+                    elif extracted_path.suffix.lower() in self.archive_extensions:
+                        logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
+                        dataset.extend(self._process_archive(extracted_path, extract_to))
+                    else:
+                        logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
+
+                except OSError as e:
+                    # gzip raises BadGzipFile (an OSError) for corrupt input; gzip.GzipFile is not an exception type.
+                    logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
+                except Exception as e:
+                    logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
+                finally:
+                    if extracted_path.exists():
+                        try:
+                            extracted_path.unlink()
+                        except OSError as e:
+                            logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
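+            # .7z and .rar support would need optional third-party readers (for example the
+            # py7zr and rarfile packages); that route is only noted here, not wired in.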
+            elif archive_extension in ('.bz2', '.7z', '.rar'):
+                logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
+
+        except Exception as e:
+            logger.error(f"Overall archive processing error for '{archive_path.name}': {e}")
+
+        return dataset
+
+    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[str]:
+        """
+        Enhanced data chunking for QR codes with sequence metadata and start/end tags.
+        max_size is the maximum *byte* capacity for a QR code (e.g., 2953 bytes for Version 40-L);
+        the parameter is currently informational, and the practical per-chunk budget below is
+        derived from the ERROR_CORRECT_M capacity actually used when rendering.
+        """
+        try:
+            json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
+            json_bytes = json_str.encode('utf-8')
+            total_bytes_length = len(json_bytes)  # QR capacity is measured in bytes, not characters
+
+            # Conservative allowance for the per-chunk wrapper: the {startN}/{endN} tags plus the
+            # "idx"/"tc"/"tl"/"hash" metadata fields, whose printed lengths vary.
+            MAX_OVERHEAD_PER_CHUNK_BYTES = 250
+
+            # generate_stylish_qr() uses ERROR_CORRECT_M, and a Version 40 QR holds 2331 bytes in
+            # byte mode at that level (2953 bytes applies to error correction L). Stay below 2331
+            # so the qrcode library never needs a version above 40.
+            PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY = 2300
+
+            effective_payload_bytes_per_chunk = PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY - MAX_OVERHEAD_PER_CHUNK_BYTES
+
+            if effective_payload_bytes_per_chunk <= 0:
+                logger.error(f"Effective payload size is zero or negative. QR size ({PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY}) is too small for metadata overhead ({MAX_OVERHEAD_PER_CHUNK_BYTES}). Cannot chunk.")
+                return []
+
+            # Number of chunks based on the original data's byte length
+            num_chunks = math.ceil(total_bytes_length / effective_payload_bytes_per_chunk)
+            if num_chunks == 0:  # Handle empty input data
+                return []
+
+            chunks_for_qr: List[str] = []
+            current_byte_pos = 0
+
+            for i in range(num_chunks):
+                # Slice the original JSON bytes for this chunk. Note: a byte slice can split a
+                # multi-byte UTF-8 character, and errors='replace' would then mangle it, so a
+                # decoder should reassemble on the byte level (or the payload should be chunked
+                # on character boundaries instead).
+                end_byte_pos = min(current_byte_pos + effective_payload_bytes_per_chunk, total_bytes_length)
+                chunk_data_bytes = json_bytes[current_byte_pos:end_byte_pos]
+                chunk_data_str = chunk_data_bytes.decode('utf-8', errors='replace')
+
+                # Create the inner JSON structure for the chunk
+                chunk_dict = {
+                    "idx": i + 1,  # 1-based indexing for user readability
+                    "tc": num_chunks,
+                    "tl": total_bytes_length,  # Total length in bytes
+                    # Built-in hash() is salted per process, so a decoder in another process cannot
+                    # verify this value; a deterministic checksum such as zlib.crc32 would be safer.
+                    "hash": hash(chunk_data_bytes) & 0xFFFFFFFF,
+                    "data": chunk_data_str
+                }
+                inner_json_string = json.dumps(chunk_dict, ensure_ascii=False, separators=(',', ':'))
+
+                # Wrap with {startN} and {endN} tags; N is the 1-based sequence number used for rejoining.
+                final_qr_string = f"{{start{i+1}}}{inner_json_string}{{end{i+1}}}"
+
+                # Double-check that the final string actually fits. If this still fails, the
+                # overhead estimate above is too low.
+                encoded_final_qr_string_len = len(final_qr_string.encode('utf-8'))
+                if encoded_final_qr_string_len > PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY:
+                    logger.warning(f"Chunk {i+1} exceeds estimated QR capacity. Actual: {encoded_final_qr_string_len} bytes, Target Max: {PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY} bytes. Consider increasing MAX_OVERHEAD_PER_CHUNK_BYTES further.")
+                    # In a production system, one might re-chunk here or raise an error.
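+                    # Sketch of the stricter alternative (not wired in): treat this as fatal so the
+                    # caller can retry with a smaller payload budget, e.g.
+                    #     raise ValueError(f"Chunk {i+1} does not fit in a single QR code")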
+ # For now, we log and proceed, hoping the qrcode library can still find a higher version + # (though the error implies it can't go beyond 40). + # The error 'Invalid version (was 41, expected 1 to 40)' means even this PRACTICAL_MAX_QR_CODE_BYTE_CAPACITY might be too generous if a chunk hits it exactly or exceeds it. + # The fix is to make `effective_payload_bytes_per_chunk` smaller, forcing more chunks but guaranteeing fit. + + chunks_for_qr.append(final_qr_string) + current_byte_pos = end_byte_pos + + if current_byte_pos < total_bytes_length: + logger.error(f"Chunking logic error: Only processed {current_byte_pos} of {total_bytes_length} bytes.") + return [] + + logger.info(f"Chunked data into {num_chunks} chunks for QR codes, with positional sequencing tags.") + return chunks_for_qr + + except Exception as e: + logger.error(f"Error chunking data: {e}") + return [] + +def generate_stylish_qr(data: Union[str, Dict], + filename: str, + size: int = 10, + border: int = 4, + fill_color: str = "#000000", + back_color: str = "#FFFFFF") -> str: + """Generate a stylish QR code with enhanced visual appeal""" + try: + qr = qrcode.QRCode( + version=None, # Let the library determine the best version for the data + error_correction=qrcode.constants.ERROR_CORRECT_M, # High error correction + box_size=size, + border=border + ) + + # `data` here is expected to be the pre-formatted string from chunk_data, + # including the {startN} and {endN} tags, and the inner JSON. + if isinstance(data, dict): + # This path should ideally not be taken if chunk_data always returns strings + # and is only called with the pre-formatted chunk string. + # Keeping it as a fallback, but the primary use case is `data` being a string here. + qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':'))) + else: + qr.add_data(str(data)) + + qr.make(fit=True) + + qr_image = qr.make_image(fill_color=fill_color, back_color=back_color) + qr_image = qr_image.convert('RGBA') + + try: + gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0)) + draw = ImageDraw.Draw(gradient) + for i in range(qr_image.width): + alpha = int(255 * (i/qr_image.width) * 0.05) + draw.line([(i, 0), (i, qr_image.height)], fill=(0, 0, 0, alpha)) + final_image = Image.alpha_composite(qr_image, gradient) + except Exception as e: + logger.warning(f"Failed to add gradient overlay to QR code: {e}. Using plain QR.") + final_image = qr_image + + output_path = QR_CODES_DIR / filename + final_image.save(output_path, quality=90) + + return str(output_path) + except Exception as e: + logger.error(f"QR generation error: {e}") + return "" + +def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]: + """Generate QR codes with enhanced visual appeal and metadata""" + # The `data` here should be the full processed dataset (List[Dict]) or a single item (Dict/str). + # The chunk_data method will handle turning this into strings suitable for QR codes. + + if not isinstance(data, (list, dict, str)): + logger.error("generate_qr_codes received data that is not a list, dict, or string.") + return [] + + try: + file_processor = EnhancedFileProcessor() + paths = [] + + if combined: + # When combined, we treat the entire `data` (which should be List[Dict]) as one large string + # to be chunked across multiple QRs. 
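+            # Each string returned by chunk_data() is already self-describing; illustrative shape
+            # (values made up): {start2}{"idx":2,"tc":5,"tl":9876,"hash":123456789,"data":"...slice..."}{end2}
+            # A reader can therefore sort chunks by "idx", concatenate the "data" fields, and check
+            # the reassembled length against "tl".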
+ chunks_of_combined_data = file_processor.chunk_data(data) + if not chunks_of_combined_data: + logger.warning("No chunks generated for combined data.") + return [] + for i, chunk_str in enumerate(chunks_of_combined_data): + # The filename now includes the chunk number within the sequence + # and total number of chunks. + filename = f'combined_qr_{i+1}_of_{len(chunks_of_combined_data)}_{int(time.time())}.png' + qr_path = generate_stylish_qr( + data=chunk_str, # This `chunk_str` already contains the {startN} and {endN} tags + filename=filename, + fill_color="#1a365d", + back_color="#ffffff" + ) + if qr_path: + paths.append(qr_path) + else: + logger.warning(f"Failed to generate QR for combined chunk {i+1}/{len(chunks_of_combined_data)}.") + else: + # If not combined, each top-level item in the data list is processed individually. + # Each individual item might itself be chunked into multiple QRs. + if isinstance(data, list): + for idx, item in enumerate(data): + item_chunks = file_processor.chunk_data(item) # Chunk each item + if not item_chunks: + logger.warning(f"No chunks generated for item {idx+1}.") + continue + for chunk_idx, chunk_str in enumerate(item_chunks): + filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(item_chunks)}_{int(time.time())}.png' + qr_path = generate_stylish_qr( + data=chunk_str, # This `chunk_str` already contains the {startN} and {endN} tags + filename=filename, + fill_color="#1a365d", + back_color="#ffffff" + ) + if qr_path: + paths.append(qr_path) + else: + logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(item_chunks)}.") + elif isinstance(data, (dict, str)): # Handle single dict/string inputs if not a list + single_item_chunks = file_processor.chunk_data(data) + if not single_item_chunks: + logger.warning("No chunks generated for single item.") + return [] + for chunk_idx, chunk_str in enumerate(single_item_chunks): + filename = f'single_item_chunk_{chunk_idx+1}_of_{len(single_item_chunks)}_{int(time.time())}.png' + qr_path = generate_stylish_qr( + data=chunk_str, + filename=filename, + fill_color="#1a365d", + back_color="#ffffff" + ) + if qr_path: + paths.append(qr_path) + else: + logger.warning(f"Failed to generate QR for single item chunk {chunk_idx+1}/{len(single_item_chunks)}.") + else: + logger.warning("Data is not a list, dict, or string and cannot be processed individually.") + + logger.info(f"Generated {len(paths)} QR codes.") + return paths # --- Chatbot Logic --- def respond_to_chat( @@ -1394,7 +3206,7 @@ def respond_to_chat( response = "There is no data available to output as JSON." # --- General Queries (if no DataFrame or specific query matched AND no filter was applied in this turn) --- - # These should not clear new_filtered_df_state unless it's a "clear chat" + # These should not clear new_filtered_df_state unless it's a "clear" command. if not response: # Only enter if no response has been generated by DataFrame/filter logic if "how many items" in lower_message or "number of items" in lower_message: if new_filtered_df_state is not None and not new_filtered_df_state.empty: @@ -1729,9 +3541,9 @@ def create_modern_interface(): viewport_html = f'
' - if enabled_states is None or len(enabled_states) != num_qr_codes: - # If states are not yet initialized or out of sync, enable all by default - enabled_states = list(range(num_qr_codes)) + # If states are not yet initialized or out of sync, enable all by default + if enabled_states is None or len(enabled_states) != num_qr_codes or not enabled_states: + enabled_states = list(range(num_qr_codes)) # Initialize with all enabled for i, path in enumerate(paths): is_enabled = i in enabled_states @@ -1818,7 +3630,7 @@ def create_modern_interface(): if qr_paths: processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.") else: - processing_status_messages.append("❌ Failed to generate QR codes (empty result or error).") + processing_status_messages.append("❌ Failed to generate QR codes (empty result or error). Check logs for details.") else: processing_status_messages.append("☑️ QR code generation was disabled. Processed data is available.") qr_paths = [] # Ensure it's empty @@ -1855,8 +3667,8 @@ def create_modern_interface(): outputs=[output_json, output_gallery, output_text, chatbot_data] ).then( on_qr_generation, - inputs=[output_gallery], - outputs=[qr_code_paths, enabled_qr_codes] + inputs=[output_gallery], # Pass the list of paths from output_gallery + outputs=[qr_code_paths, enabled_qr_codes] # Update qr_code_paths state and initial enabled_qr_codes state ) viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output]) @@ -1968,7 +3780,7 @@ def create_modern_interface(): 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure. 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. - 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. + 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. **Remember to check the "Generate QR Codes" checkbox!** 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps. 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images. 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output.