Update app2.py

app2.py CHANGED
@@ -809,63 +809,171 @@ class EnhancedFileProcessor:
 
         return dataset
 
-    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[
-        """
+    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[str]:
+        """
+        Enhanced data chunking for QR codes with sequence metadata and start/end tags.
+        max_size is the maximum *byte* capacity for a QR code (e.g., 2953 bytes for Version 40-L).
+        """
         try:
             json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
-            total_length = len(json_str)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # QR capacity limits are specified in *bytes*, so measure the JSON
+            # as UTF-8 rather than counting characters.
+            json_bytes = json_str.encode('utf-8')
+            total_bytes_length = len(json_bytes)
+
+            # Each QR code carries "{startN}" + a metadata wrapper
+            # {"idx":...,"tc":...,"tl":...,"hash":...,"data":"<slice>"} + "{endN}".
+            # The tags are ~10-12 ASCII characters each and the wrapper with
+            # large field values is roughly 80 bytes, so 120 bytes is a
+            # generous per-chunk overhead estimate. Byte-mode capacity at
+            # Version 40 is 2953 bytes at error level L but only 2331 bytes at
+            # level M, which is what generate_stylish_qr uses below.
+            MAX_QR_CODE_BYTE_CAPACITY = 2331  # Version 40, error correction M, byte mode
+            MAX_OVERHEAD_PER_CHUNK_BYTES = 120
+
+            effective_payload_bytes_per_chunk = MAX_QR_CODE_BYTE_CAPACITY - MAX_OVERHEAD_PER_CHUNK_BYTES
+
+            if effective_payload_bytes_per_chunk <= 0:
+                logger.error(f"Max QR size ({MAX_QR_CODE_BYTE_CAPACITY}) is too small for metadata overhead ({MAX_OVERHEAD_PER_CHUNK_BYTES}). Cannot chunk.")
                 return []
 
-
-
+            # Calculate the number of chunks from the data's byte length.
+            num_chunks = math.ceil(total_bytes_length / effective_payload_bytes_per_chunk)
+            if num_chunks == 0:  # Handle empty input data
+                return []
 
-
-
-            "tc": 1,
-            "tl": total_length,
-            "hash": hash(chunk_data) & 0xFFFFFFFF,
-            "data": chunk_data
-        }
-        return [chunk]
+            chunks_for_qr: List[str] = []
+            current_byte_pos = 0
 
-            num_chunks = -(-total_length // effective_chunk_size)
-            chunks = []
-            current_pos = 0
             for i in range(num_chunks):
-
-
-
-
-
+                # Take the next slice of the original JSON bytes.
+                end_byte_pos = min(current_byte_pos + effective_payload_bytes_per_chunk, total_bytes_length)
+                chunk_data_bytes = json_bytes[current_byte_pos:end_byte_pos]
+                chunk_data_str = chunk_data_bytes.decode('utf-8', errors='replace')
+
+                # Create the inner JSON structure for the chunk.
+                chunk_dict = {
+                    "idx": i + 1,  # 1-based indexing for user readability
                     "tc": num_chunks,
-                    "tl":
-                    "hash": hash(
+                    "tl": total_bytes_length,  # Total length in bytes
+                    "hash": hash(chunk_data_bytes) & 0xFFFFFFFF,  # Hash of the byte slice
                     "data": chunk_data_str
                 }
-
-
-
-
-
+                inner_json_string = json.dumps(chunk_dict, ensure_ascii=False, separators=(',', ':'))
+
+                # Wrap with {startN}/{endN} tags; N is the 1-based sequence
+                # number a scanner uses to rejoin the chunks in order.
+                final_qr_string = f"{{start{i+1}}}{inner_json_string}{{end{i+1}}}"
+
+                # Sanity check: if this trips, the overhead estimate is off and
+                # MAX_OVERHEAD_PER_CHUNK_BYTES needs adjusting.
+                if len(final_qr_string.encode('utf-8')) > MAX_QR_CODE_BYTE_CAPACITY:
+                    logger.warning(f"Chunk {i+1} exceeds estimated QR capacity: {len(final_qr_string.encode('utf-8'))} bytes vs max {MAX_QR_CODE_BYTE_CAPACITY} bytes.")
+
+                chunks_for_qr.append(final_qr_string)
+                current_byte_pos = end_byte_pos
+
+            if current_byte_pos < total_bytes_length:
+                logger.error(f"Chunking logic error: Only processed {current_byte_pos} of {total_bytes_length} bytes.")
                 return []
 
-            logger.info(f"Chunked data into {num_chunks} chunks for QR codes.")
-            return
+            logger.info(f"Chunked data into {num_chunks} chunks for QR codes, with positional sequencing tags.")
+            return chunks_for_qr
 
         except Exception as e:
             logger.error(f"Error chunking data: {e}")
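The `{startN}`/`{endN}` framing plus the `idx`, `tc`, and `tl` fields give a scanner everything it needs to reassemble the original JSON. Below is a minimal decoder sketch; it is a hypothetical helper, not part of app2.py. Two caveats follow from the code above: Python's `hash()` is salted per process, so the stored `hash` field cannot be verified from a separate scanner process, and because each byte slice is decoded with `errors='replace'`, a multi-byte UTF-8 character split across a chunk boundary will not survive the round trip, so the sketch assumes ASCII-safe payloads.

```python
import json
import re
from typing import List

# Hypothetical helper, not part of app2.py: reassemble the original JSON
# from the strings produced by EnhancedFileProcessor.chunk_data above.
CHUNK_RE = re.compile(r"^\{start(\d+)\}(.*)\{end\1\}$")

def reassemble_chunks(payloads: List[str]):
    chunks = {}
    total_chunks = None
    for payload in payloads:
        match = CHUNK_RE.match(payload)
        if not match:
            raise ValueError("payload is missing its {startN}...{endN} tags")
        meta = json.loads(match.group(2))
        if int(match.group(1)) != meta["idx"]:
            raise ValueError("tag sequence number disagrees with metadata idx")
        # meta["hash"] is not checked: hash() is process-salted and not portable.
        chunks[meta["idx"]] = meta["data"]
        total_chunks = meta["tc"]
    if total_chunks is None or len(chunks) != total_chunks:
        raise ValueError(f"expected {total_chunks} chunks, got {len(chunks)}")
    # idx is 1-based, so join chunks 1..tc in order and parse the result.
    return json.loads("".join(chunks[i] for i in range(1, total_chunks + 1)))
```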
@@ -880,13 +988,16 @@ def generate_stylish_qr(data: Union[str, Dict],
     """Generate a stylish QR code with enhanced visual appeal"""
     try:
         qr = qrcode.QRCode(
-            version=None,
-            error_correction=qrcode.constants.ERROR_CORRECT_M,
+            version=None,  # Let the library determine the best version for the data
+            error_correction=qrcode.constants.ERROR_CORRECT_M,  # Medium (~15%) error correction
             box_size=size,
             border=border
         )
 
+        # `data` here is expected to be the pre-formatted string from chunk_data,
+        # including the {startN} and {endN} tags, and the inner JSON.
         if isinstance(data, dict):
+            # This path should ideally not be taken if chunk_data always returns strings
             qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
         else:
             qr.add_data(str(data))
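With `version=None`, the `qrcode` library sizes the symbol only when the code is built: `make(fit=True)` picks the smallest version that holds the data at the requested error-correction level and raises `DataOverflowError` if even Version 40 cannot. A minimal sketch of the same call pattern (payload and output path are illustrative):

```python
import qrcode
from qrcode.constants import ERROR_CORRECT_M
from qrcode.exceptions import DataOverflowError

# Mirrors the construction above: auto-sized version, level-M error correction.
qr = qrcode.QRCode(version=None, error_correction=ERROR_CORRECT_M,
                   box_size=10, border=4)
try:
    qr.add_data('{start1}{"idx":1,"tc":1,"tl":2,"hash":0,"data":"[]"}{end1}')
    qr.make(fit=True)  # chooses the smallest version that fits the data
    img = qr.make_image(fill_color="#1a365d", back_color="#ffffff")
    img.save("example_chunk_qr.png")  # illustrative path
except DataOverflowError:
    # Raised when the payload exceeds Version 40 capacity at the chosen
    # error-correction level, i.e. the chunker's size estimate was off.
    print("chunk too large for a single QR code")
```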
@@ -917,8 +1028,11 @@ def generate_stylish_qr(data: Union[str, Dict],
 
 def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
     """Generate QR codes with enhanced visual appeal and metadata"""
-
-
+    # The `data` here should be the full processed dataset (List[Dict]) or a single item (Dict/str).
+    # The chunk_data method will handle turning this into strings suitable for QR codes.
+
+    if not isinstance(data, (list, dict, str)):
+        logger.error("generate_qr_codes received data that is not a list, dict, or string.")
         return []
 
     try:
@@ -926,14 +1040,18 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
         paths = []
 
         if combined:
-
-
+            # When combined, we treat the entire `data` (which should be List[Dict])
+            # as one large string to be chunked across multiple QRs.
+            chunks_of_combined_data = file_processor.chunk_data(data)
+            if not chunks_of_combined_data:
                 logger.warning("No chunks generated for combined data.")
                 return []
-            for i,
-                filename
+            for i, chunk_str in enumerate(chunks_of_combined_data):
+                # The filename includes the chunk number within the sequence
+                # and the total number of chunks.
+                filename = f'combined_qr_{i+1}_of_{len(chunks_of_combined_data)}_{int(time.time())}.png'
                 qr_path = generate_stylish_qr(
-                    data=
+                    data=chunk_str,  # This chunk_str already contains the {startN} and {endN} tags
                     filename=filename,
                     fill_color="#1a365d",
                     back_color="#ffffff"
@@ -941,18 +1059,20 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
                 if qr_path:
                     paths.append(qr_path)
                 else:
-                    logger.warning(f"Failed to generate QR for chunk {i+1}/{len(
+                    logger.warning(f"Failed to generate QR for combined chunk {i+1}/{len(chunks_of_combined_data)}.")
         else:
-
+            # If not combined, each top-level item in the data list is processed
+            # individually; each item might itself be chunked into multiple QRs.
+            if isinstance(data, list):
                 for idx, item in enumerate(data):
-
-                    if not
+                    item_chunks = file_processor.chunk_data(item)  # Chunk each item
+                    if not item_chunks:
                         logger.warning(f"No chunks generated for item {idx+1}.")
                         continue
-                    for chunk_idx,
-                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(
+                    for chunk_idx, chunk_str in enumerate(item_chunks):
+                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(item_chunks)}_{int(time.time())}.png'
                         qr_path = generate_stylish_qr(
-                            data=
+                            data=chunk_str,  # This chunk_str already contains the {startN} and {endN} tags
                             filename=filename,
                             fill_color="#1a365d",
                             back_color="#ffffff"
@@ -960,9 +1080,26 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
                         if qr_path:
                             paths.append(qr_path)
                         else:
-                            logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(
+                            logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(item_chunks)}.")
+            elif isinstance(data, (dict, str)):  # Handle single dict/string inputs if not a list
+                single_item_chunks = file_processor.chunk_data(data)
+                if not single_item_chunks:
+                    logger.warning("No chunks generated for single item.")
+                    return []
+                for chunk_idx, chunk_str in enumerate(single_item_chunks):
+                    filename = f'single_item_chunk_{chunk_idx+1}_of_{len(single_item_chunks)}_{int(time.time())}.png'
+                    qr_path = generate_stylish_qr(
+                        data=chunk_str,
+                        filename=filename,
+                        fill_color="#1a365d",
+                        back_color="#ffffff"
+                    )
+                    if qr_path:
+                        paths.append(qr_path)
+                    else:
+                        logger.warning(f"Failed to generate QR for single item chunk {chunk_idx+1}/{len(single_item_chunks)}.")
             else:
-                logger.warning("
+                logger.warning("Data is not a list, dict, or string and cannot be processed individually.")
 
         logger.info(f"Generated {len(paths)} QR codes.")
         return paths
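A hypothetical call site for the two modes above: `combined=True` serializes the whole record list into one chunk sequence, while `combined=False` gives each record its own sequence with its own `{startN}` numbering. The example records are illustrative.

```python
# Hypothetical example data; generate_qr_codes as defined above.
records = [{"name": "alpha", "size": 1024}, {"name": "beta", "size": 2048}]

combined_paths = generate_qr_codes(records, combined=True)   # combined_qr_<i>_of_<n>_... files
per_item_paths = generate_qr_codes(records, combined=False)  # item_<i>_chunk_<j>_of_<m>_... files

print(len(combined_paths), len(per_item_paths))
```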
@@ -973,11 +1110,11 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
 
 # --- Chatbot Logic ---
 def respond_to_chat(
-    message: str,
-    chat_history: List[Tuple[str, str]],
+    message: str,
+    chat_history: List[Tuple[str, str]],
     chatbot_data: Optional[List[Dict]],
     # Add current_filtered_df_state as input, it will be updated and returned
-    current_filtered_df_state: Optional[pd.DataFrame]
+    current_filtered_df_state: Optional[pd.DataFrame]
 ) -> Tuple[List[Tuple[str, str]], List[Dict], Optional[pd.DataFrame]]:
     """
     Responds to user chat messages based on the loaded JSON data.
@@ -992,7 +1129,7 @@ def respond_to_chat(
     response = ""
     lower_message = message.lower().strip()
     # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it
-    new_filtered_df_state = current_filtered_df_state
+    new_filtered_df_state = current_filtered_df_state
 
     try:
         # Attempt to flatten the data structure for easier querying
@@ -1137,25 +1274,29 @@ def respond_to_chat(
                 new_filtered_df_state = None  # Clear previous filter if column not found
             else:
                 # IMPORTANT: Always filter from the original full dataframe 'df'
-                active_df_to_filter = df.copy()
+                active_df_to_filter = df.copy()
                 try:
                     # Attempt to infer value type for comparison
                     target_value: Any
                     col_dtype = df[column_name].dtype
 
+                    # Check if current_filtered_df_state exists and is not empty; if so,
+                    # filter from it, otherwise use the full df.
+                    df_to_filter = current_filtered_df_state if current_filtered_df_state is not None and not current_filtered_df_state.empty else df.copy()
+
                     if pd.api.types.is_numeric_dtype(col_dtype) and operator in ['>', '>=', '<', '<=', '==', '!=']:
                         try:
                             target_value = float(value_str)
-                            col_series = pd.to_numeric(
+                            col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
                         except ValueError:
                             response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
                             target_value = None  # Error case
                     elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
                         target_value = value_str.lower() == 'true'
-                        col_series =
+                        col_series = df_to_filter[column_name].astype(bool, errors='ignore')
                     else:  # Assume string comparison otherwise
                         target_value = str(value_str)
-                        col_series =
+                        col_series = df_to_filter[column_name].astype(str).str.lower()  # Case-insensitive for strings
                         value_str_lower = target_value.lower()
 
 
@@ -1193,16 +1334,16 @@ def respond_to_chat(
 
                     if condition is not None:
                         # Apply condition to the active_df_to_filter (which is a copy of the full df)
-                        filtered_results_df =
+                        filtered_results_df = df_to_filter[condition]  # Use df_to_filter here
                         if not filtered_results_df.empty:
                             new_filtered_df_state = filtered_results_df  # Update state with new filter result
                             num_results = len(filtered_results_df)
                             preview_rows = min(num_results, 5)
                             preview_cols = min(len(filtered_results_df.columns), 5)
-
+
                             preview_df = filtered_results_df.head(preview_rows).iloc[:, :preview_cols]
                             preview_str = preview_df.to_string(index=False)
-
+
                             response = (f"Found {num_results} items where '{column_name}' {operator} '{value_str}'.\n"
                                         f"Here's a preview:\n```\n{preview_str}\n```\n"
                                         f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
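The `condition` applied above is a boolean mask built from `col_series`, the parsed `operator`, and `target_value`. A standalone sketch of that comparison step follows; the `OPS` table is illustrative, since the diff does not show app2.py's actual operator dispatch.

```python
import pandas as pd

# Illustrative operator dispatch for the comparison step.
OPS = {
    ">":  lambda s, v: s > v,
    ">=": lambda s, v: s >= v,
    "<":  lambda s, v: s < v,
    "<=": lambda s, v: s <= v,
    "==": lambda s, v: s == v,
    "!=": lambda s, v: s != v,
}

df_to_filter = pd.DataFrame({"price": [5, 10, 20], "name": ["a", "b", "c"]})
col_series = pd.to_numeric(df_to_filter["price"], errors="coerce")  # as in the numeric branch
condition = OPS[">="](col_series, 10.0)  # boolean mask; NaN rows compare as False
print(df_to_filter[condition])           # rows where price >= 10
```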
@@ -1224,7 +1365,7 @@ def respond_to_chat(
                 response = f"An error occurred while applying the filter: {e}"
                 logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}")
         # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results)
-
+
         # --- End of Enhanced Filter Logic ---
 
         # If `response` is still empty, it means no filter query was matched by the filter_match regex.
@@ -1589,6 +1730,7 @@ def create_modern_interface():
         viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'
 
         if enabled_states is None or len(enabled_states) != num_qr_codes:
+            # If states are not yet initialized or out of sync, enable all by default
             enabled_states = list(range(num_qr_codes))
 
         for i, path in enumerate(paths):
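A minimal illustration of the fallback above: when the stored state list is missing, or its length no longer matches the number of generated codes, every index is re-enabled.

```python
# Stale or missing viewport state re-enables every QR code by index.
num_qr_codes = 4
enabled_states = None  # e.g. first render, before any toggling
if enabled_states is None or len(enabled_states) != num_qr_codes:
    enabled_states = list(range(num_qr_codes))
print(enabled_states)  # [0, 1, 2, 3]
```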
@@ -1753,14 +1895,14 @@ def create_modern_interface():
         try:
             data_list = data_df.to_dict(orient='records')
             json_str = json.dumps(data_list, indent=2, ensure_ascii=False)
-
+
             timestamp = int(time.time())
             filename = f"{filename_prefix}_{timestamp}.json"
             file_path = TEMP_DIR / filename
-
+
             with open(file_path, 'w', encoding='utf-8') as f:
                 f.write(json_str)
-
+
             logger.info(f"Successfully created JSON file for download: {file_path}")
             return str(file_path)
         except Exception as e:
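A hypothetical round trip through this helper, assuming `download_json_data` and `TEMP_DIR` are in scope as defined in app2.py:

```python
import json
import pandas as pd

# Hypothetical usage of download_json_data as shown above.
df = pd.DataFrame([{"id": 1, "ok": True}, {"id": 2, "ok": False}])
path = download_json_data(df, "full_data")  # e.g. TEMP_DIR/full_data_<timestamp>.json
if path is not None:
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    assert records == df.to_dict(orient="records")
```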
@@ -1772,7 +1914,7 @@ def create_modern_interface():
             logger.info("No full data available to download.")
             # Optionally, could return a gr.Warning or gr.Info to the UI if we had a dedicated status Textbox for downloads
             return None
-
+
         # The chatbot_data state is a list of dicts. Convert to DataFrame for download_json_data.
         # The df created in respond_to_chat is not directly used here to ensure we get the *original* full data.
         try:
@@ -1788,7 +1930,7 @@ def create_modern_interface():
         except Exception as e:
             logger.error(f"Error converting full chatbot_data to DataFrame for download: {e}")
             return None
-
+
         return download_json_data(df_to_download, "full_data")
 
     def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]:
@@ -1817,7 +1959,7 @@ def create_modern_interface():
         - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)**
         - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification.
         - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item.
-        - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data, **including positional sequencing tags `{startN}` and `{endN}` in the QR code content**.
         - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
         - **Modern Design**: Clean, responsive interface with visual feedback.
         - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information.
@@ -1855,4 +1997,4 @@ def main():
         raise
 
 if __name__ == "__main__":
-    main()
+    main()