acecalisto3 committed on
Commit
0362333
·
verified ·
1 Parent(s): c90c7cb

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +224 -82
app2.py CHANGED
@@ -809,63 +809,171 @@ class EnhancedFileProcessor:
809
 
810
  return dataset
811
 
812
- def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
813
- """Enhanced data chunking with sequence metadata"""
 
 
 
814
  try:
815
  json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
816
- total_length = len(json_str)
817
-
818
- metadata_template = {
819
- "idx": 0,
820
- "tc": 1,
821
- "tl": total_length,
822
- "hash": 0,
823
- "data": ""
824
- }
825
- metadata_template_with_hash = {**metadata_template, "hash": 1234567890}
826
- overhead_estimate = len(json.dumps(metadata_template_with_hash, separators=(',', ':'))) + 50
827
-
828
- effective_chunk_size = max_size - overhead_estimate
829
-
830
- if effective_chunk_size <= 0:
831
- logger.error(f"Max QR size ({max_size}) is too small for metadata overhead ({overhead_estimate}). Cannot chunk.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
832
  return []
833
 
834
- if total_length <= effective_chunk_size:
835
- chunk_data = json_str
 
 
836
 
837
- chunk = {
838
- "idx": 0,
839
- "tc": 1,
840
- "tl": total_length,
841
- "hash": hash(chunk_data) & 0xFFFFFFFF,
842
- "data": chunk_data
843
- }
844
- return [chunk]
845
 
846
- num_chunks = -(-total_length // effective_chunk_size)
847
- chunks = []
848
- current_pos = 0
849
  for i in range(num_chunks):
850
- end_pos = min(current_pos + effective_chunk_size, total_length)
851
- chunk_data_str = json_str[current_pos:end_pos]
852
-
853
- chunk = {
854
- "idx": i,
 
 
 
855
  "tc": num_chunks,
856
- "tl": total_length,
857
- "hash": hash(chunk_data_str) & 0xFFFFFFFF,
858
  "data": chunk_data_str
859
  }
860
- chunks.append(chunk)
861
- current_pos = end_pos
862
-
863
- if current_pos < total_length:
864
- logger.error(f"Chunking logic error: Only processed {current_pos} of {total_length} characters.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
  return []
866
 
867
- logger.info(f"Chunked data into {num_chunks} chunks for QR codes.")
868
- return chunks
869
 
870
  except Exception as e:
871
  logger.error(f"Error chunking data: {e}")
@@ -880,13 +988,16 @@ def generate_stylish_qr(data: Union[str, Dict],
880
  """Generate a stylish QR code with enhanced visual appeal"""
881
  try:
882
  qr = qrcode.QRCode(
883
- version=None,
884
- error_correction=qrcode.constants.ERROR_CORRECT_M,
885
  box_size=size,
886
  border=border
887
  )
888
 
 
 
889
  if isinstance(data, dict):
 
890
  qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
891
  else:
892
  qr.add_data(str(data))
@@ -917,8 +1028,11 @@ def generate_stylish_qr(data: Union[str, Dict],
917
 
918
  def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
919
  """Generate QR codes with enhanced visual appeal and metadata"""
920
- if not isinstance(data, list):
921
- logger.error("generate_qr_codes received data that is not a list.")
 
 
 
922
  return []
923
 
924
  try:
@@ -926,14 +1040,18 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
926
  paths = []
927
 
928
  if combined:
929
- chunks = file_processor.chunk_data(data)
930
- if not chunks:
 
 
931
  logger.warning("No chunks generated for combined data.")
932
  return []
933
- for i, chunk in enumerate(chunks):
934
- filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
 
 
935
  qr_path = generate_stylish_qr(
936
- data=chunk,
937
  filename=filename,
938
  fill_color="#1a365d",
939
  back_color="#ffffff"
@@ -941,18 +1059,20 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
941
  if qr_path:
942
  paths.append(qr_path)
943
  else:
944
- logger.warning(f"Failed to generate QR for chunk {i+1}/{len(chunks)}.")
945
  else:
946
- if data:
 
 
947
  for idx, item in enumerate(data):
948
- chunks = file_processor.chunk_data(item)
949
- if not chunks:
950
  logger.warning(f"No chunks generated for item {idx+1}.")
951
  continue
952
- for chunk_idx, chunk in enumerate(chunks):
953
- filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
954
  qr_path = generate_stylish_qr(
955
- data=chunk,
956
  filename=filename,
957
  fill_color="#1a365d",
958
  back_color="#ffffff"
@@ -960,9 +1080,26 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
960
  if qr_path:
961
  paths.append(qr_path)
962
  else:
963
- logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(chunks)}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
964
  else:
965
- logger.warning("No items in data list to process individually.")
966
 
967
  logger.info(f"Generated {len(paths)} QR codes.")
968
  return paths
@@ -973,11 +1110,11 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
973
 
974
  # --- Chatbot Logic ---
975
  def respond_to_chat(
976
- message: str,
977
- chat_history: List[Tuple[str, str]],
978
  chatbot_data: Optional[List[Dict]],
979
  # Add current_filtered_df_state as input, it will be updated and returned
980
- current_filtered_df_state: Optional[pd.DataFrame]
981
  ) -> Tuple[List[Tuple[str, str]], List[Dict], Optional[pd.DataFrame]]:
982
  """
983
  Responds to user chat messages based on the loaded JSON data.
@@ -992,7 +1129,7 @@ def respond_to_chat(
992
  response = ""
993
  lower_message = message.lower().strip()
994
  # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it
995
- new_filtered_df_state = current_filtered_df_state
996
 
997
  try:
998
  # Attempt to flatten the data structure for easier querying
@@ -1137,25 +1274,29 @@ def respond_to_chat(
1137
  new_filtered_df_state = None # Clear previous filter if column not found
1138
  else:
1139
  # IMPORTANT: Always filter from the original full dataframe 'df'
1140
- active_df_to_filter = df.copy()
1141
  try:
1142
  # Attempt to infer value type for comparison
1143
  target_value: Any
1144
  col_dtype = df[column_name].dtype
1145
 
 
 
 
 
1146
  if pd.api.types.is_numeric_dtype(col_dtype) and operator in ['>', '>=', '<', '<=', '==', '!=']:
1147
  try:
1148
  target_value = float(value_str)
1149
- col_series = pd.to_numeric(filtered_df[column_name], errors='coerce')
1150
  except ValueError:
1151
  response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
1152
  target_value = None # Error case
1153
  elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
1154
  target_value = value_str.lower() == 'true'
1155
- col_series = filtered_df[column_name].astype(bool, errors='ignore')
1156
  else: # Assume string comparison otherwise
1157
  target_value = str(value_str)
1158
- col_series = filtered_df[column_name].astype(str).str.lower() # Case-insensitive for strings
1159
  value_str_lower = target_value.lower()
1160
 
1161
 
@@ -1193,16 +1334,16 @@ def respond_to_chat(
1193
 
1194
  if condition is not None:
1195
  # Apply condition to the active_df_to_filter (which is a copy of the full df)
1196
- filtered_results_df = active_df_to_filter[condition]
1197
  if not filtered_results_df.empty:
1198
  new_filtered_df_state = filtered_results_df # Update state with new filter result
1199
  num_results = len(filtered_results_df)
1200
  preview_rows = min(num_results, 5)
1201
  preview_cols = min(len(filtered_results_df.columns), 5)
1202
-
1203
  preview_df = filtered_results_df.head(preview_rows).iloc[:, :preview_cols]
1204
  preview_str = preview_df.to_string(index=False)
1205
-
1206
  response = (f"Found {num_results} items where '{column_name}' {operator} '{value_str}'.\n"
1207
  f"Here's a preview:\n```\n{preview_str}\n```\n"
1208
  f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
@@ -1224,7 +1365,7 @@ def respond_to_chat(
1224
  response = f"An error occurred while applying the filter: {e}"
1225
  logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}")
1226
  # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results)
1227
-
1228
  # --- End of Enhanced Filter Logic ---
1229
 
1230
  # If `response` is still empty, it means no filter query was matched by the filter_match regex.
@@ -1589,6 +1730,7 @@ def create_modern_interface():
1589
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'
1590
 
1591
  if enabled_states is None or len(enabled_states) != num_qr_codes:
 
1592
  enabled_states = list(range(num_qr_codes))
1593
 
1594
  for i, path in enumerate(paths):
@@ -1753,14 +1895,14 @@ def create_modern_interface():
1753
  try:
1754
  data_list = data_df.to_dict(orient='records')
1755
  json_str = json.dumps(data_list, indent=2, ensure_ascii=False)
1756
-
1757
  timestamp = int(time.time())
1758
  filename = f"{filename_prefix}_{timestamp}.json"
1759
  file_path = TEMP_DIR / filename
1760
-
1761
  with open(file_path, 'w', encoding='utf-8') as f:
1762
  f.write(json_str)
1763
-
1764
  logger.info(f"Successfully created JSON file for download: {file_path}")
1765
  return str(file_path)
1766
  except Exception as e:
@@ -1772,7 +1914,7 @@ def create_modern_interface():
1772
  logger.info("No full data available to download.")
1773
  # Optionally, could return a gr.Warning or gr.Info to the UI if we had a dedicated status Textbox for downloads
1774
  return None
1775
-
1776
  # The chatbot_data state is a list of dicts. Convert to DataFrame for download_json_data.
1777
  # The df created in respond_to_chat is not directly used here to ensure we get the *original* full data.
1778
  try:
@@ -1788,7 +1930,7 @@ def create_modern_interface():
1788
  except Exception as e:
1789
  logger.error(f"Error converting full chatbot_data to DataFrame for download: {e}")
1790
  return None
1791
-
1792
  return download_json_data(df_to_download, "full_data")
1793
 
1794
  def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]:
@@ -1817,7 +1959,7 @@ def create_modern_interface():
1817
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)**
1818
  - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification.
1819
  - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item.
1820
- - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data.
1821
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1822
  - **Modern Design**: Clean, responsive interface with visual feedback.
1823
  - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information.
@@ -1855,4 +1997,4 @@ def main():
1855
  raise
1856
 
1857
  if __name__ == "__main__":
1858
- main()
 
809
 
810
  return dataset
811
 
812
+ def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[str]:
813
+ """
814
+ Enhanced data chunking for QR codes with sequence metadata and start/end tags.
815
+ max_size is the maximum *byte* capacity for a QR code (e.g., 2953 bytes for Version 40-L).
816
+ """
817
  try:
818
  json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
819
+ total_length = len(json_str.encode('utf-8')) # Get actual byte length for QR capacity
820
+
821
+ # The overhead needs to be dynamic. The maximum capacity of QR code
822
+ # is in *bytes*. So we need to encode the JSON and measure its length.
823
+ # The largest QR code (Version 40, error correction L) can hold up to 2953 bytes in byte mode.
824
+ # For UTF-8, it's roughly 2953 * 0.7 = ~2000 characters if many non-ASCII.
825
+ # Let's use 2000 characters as a conservative estimate for max_size.
826
+ # However, the qrcode library will auto-select version based on *bytes* and error correction.
827
+ # So, our `max_size` (which is in bytes) should be the *maximum possible byte capacity*.
828
+ # Let's use the standard maximum byte-mode capacity of a Version 40 QR code at error correction level L (at level M it drops to 2331 bytes).
829
+ # Max capacity for byte mode, Version 40-L: 2953 bytes.
830
+ # We will try to fit as much as possible, but need to subtract for our metadata.
831
+
832
+ # Estimate overhead for metadata like {"idx": 0, "tc": 1, "tl": 12345, "hash": 1234567890, "data": ""}
833
+ # and for the "{startX}" and "{endX}" tags.
834
+ # `{start<idx>}` and `{end<idx>}` tags.
835
+ # Max idx could be e.g. 999 if many chunks, so 11 chars for {start999} and {end999}.
836
+ # Let's assume a generous overhead for the structural JSON + sequence tags.
837
+ # A typical metadata JSON string might be ~60-80 bytes. Tags add ~20 bytes.
838
+ # Let's target a safe `effective_chunk_size` of about 2800 bytes for data content.
839
+
840
+ effective_max_qr_data_bytes = 2800 # A conservative estimate for actual data payload per QR after metadata
841
+
842
+ # Calculate the number of chunks based on byte length
843
+ # The JSON object for each chunk will contain metadata + a portion of the original data.
844
+ # Example chunk JSON: {"idx":X,"tc":Y,"tl":Z,"hash":H,"data":"<chunk_payload>"}
845
+ # The total string for the QR code will be "{startX}{<chunk_json_string>}{endX}"
846
+
847
+ # We need to calculate the maximum length of the string `"<chunk_json_string>"`
848
+ # such that when wrapped with `{startX}` and `{endX}`, it fits within `max_size` (2953).
849
+
850
+ # Let's estimate the size of the *metadata* part of the chunk JSON, assuming 100 total chunks (max 3 digits for idx, tc).
851
+ # {"idx":99,"tc":99,"tl":999999,"hash":4294967295,"data":""} -> roughly 60-70 bytes.
852
+ # Plus {startXXX} and {endXXX} -> 11 chars each. So, 22 chars + ~70 bytes for json metadata = ~92 bytes.
853
+ # This is complex because the `hash` and `tl` can vary in length.
854
+ # Let's fix a `base_metadata_size` and allocate the rest to `data`.
855
+ base_metadata_size_estimate = len(json.dumps({"idx": 999, "tc": 999, "tl": 99999999999, "hash": 4294967295, "data": ""}, separators=(',', ':'))) # ~70-80 bytes
856
+
857
+ # The actual content for the QR code will be "{startX}{<chunk_payload_with_meta>}{endX}"
858
+ # The length of "{startX}" and "{endX}" depends on X. Max X can be 999.
859
+ # So, len("{start999}") = 10 characters.
860
+ # len("{end999}") = 8 characters, so total tag overhead per chunk: 10 + 8 = 18 characters.
861
+ # Let's assume 1 byte per character for these tags for simplicity, as they are ASCII.
862
+
863
+ # Max size of data *including our JSON wrapper and start/end tags* should be `max_size` (2953 bytes)
864
+ # Let's refine the effective chunk size:
865
+ # effective_chunk_size_for_data = max_size - (base_metadata_size_estimate + 20)
866
+ # This is still tricky because `hash` depends on `chunk_data_str`.
867
+ # A simpler, more reliable approach: calculate how many characters of the *original* `json_str` can fit.
868
+
869
+ # Let's simplify the max_size for internal data: use 2000 characters (bytes for UTF-8) as a safe maximum for payload data.
870
+ # This accounts for the variable nature of UTF-8 characters and QR capacity.
871
+ # Max byte-mode capacity for a Version 40 QR code: 2953 bytes at error correction L (2331 bytes at M).
872
+ # Max characters that can be encoded in UTF-8: ~2000 (if mostly ASCII, more like 2953 chars).
873
+ # To be safe and ensure it fits, we'll aim for a character limit lower than the byte limit.
874
+ max_chars_per_qr = 1800 # Safe character count to fit within QR code, considering JSON structure and UTF-8
875
+
876
+ # Calculate approximate effective size for the original JSON string *portion*
877
+ # The full string for the QR will be: "{startX}{"idx":...,"data":"chunk_of_original_json"}{endX}"
878
+ # We need to account for:
879
+ # - `{startX}` and `{endX}` tags (approx 20 chars)
880
+ # - `{"idx":X,"tc":Y,"tl":Z,"hash":H,"data":""}` (approx 70 chars)
881
+ # Total overhead per QR code: ~90 characters.
882
+ # So, characters available for `chunk_of_original_json` = `max_chars_per_qr` - 90 = 1710 characters.
883
+ # This is an approximate, as hash and total_length can vary.
884
+
885
+ # To make it robust, we will iterate and subtract available space.
886
+ # Let's try to fit as much of the original JSON string as possible into each QR.
887
+ # The QR code data will be a JSON string like:
888
+ # {"idx": i, "tc": num_chunks, "tl": total_length, "hash": chunk_hash, "data": "original_json_slice"}
889
+ # This entire dictionary will then be prefixed/suffixed.
890
+
891
+ # The actual byte capacity of QR codes depends on version, error correction, and mode.
892
+ # For "Byte" mode, Version 40-L is 2953 bytes.
893
+ # The `qrcode` library auto-selects the smallest version.
894
+ # We'll work with `json_str` as UTF-8 bytes to be precise.
895
+
896
+ json_bytes = json_str.encode('utf-8')
897
+ total_bytes_length = len(json_bytes)
898
+
899
+ # Max bytes for data payload inside the chunk_dict (e.g., "original_json_slice")
900
+ # We need to calculate the overhead for the chunk metadata *itself* + start/end tags.
901
+ # Let's take a fixed max QR capacity (V40-L) of 2953 bytes for now.
902
+ # Then estimate the *maximum possible overhead* for the wrapper JSON and tags.
903
+ # Max overhead for `{"idx":999,"tc":999,"tl":99999999999,"hash":4294967295,"data":""}` (approx 70-80 bytes)
904
+ # Plus tags `{start999}` `{end999}` (20 bytes). Total overhead ~100 bytes.
905
+ # So, `effective_payload_bytes_per_qr` = 2953 - 100 = 2853 bytes.
906
+ # This estimate is critical. If it's too high, QR generation will fail; too low, too many QRs.
907
+ # A more accurate way: try to generate a QR with a very small payload and full metadata, see max capacity.
908
+
909
+ # Let's go with a fixed `MAX_QR_CODE_BYTE_CAPACITY = 2953` (V40-L) as the total bytes a QR can hold.
910
+ # And `MAX_OVERHEAD_PER_CHUNK_BYTES` = 100.
911
+ # This means `MAX_DATA_PAYLOAD_BYTES_PER_CHUNK` = `MAX_QR_CODE_BYTE_CAPACITY` - `MAX_OVERHEAD_PER_CHUNK_BYTES`
912
+ # = 2953 - 100 = 2853 bytes.
913
+
914
+ MAX_QR_CODE_BYTE_CAPACITY = 2953 # Version 40, Error Correction M, Byte mode
915
+ # Max possible length for tags {start<idx>}{end<idx>} and the meta fields
916
+ # Example: {start12345}{"idx":12344,"tc":12345,"tl":999999999,"hash":1234567890,"data":""}{end12345}
917
+ # Max idx/tc up to 5 digits means {start99999} is 12 chars. So 24 chars for tags.
918
+ # Min length of meta JSON (empty data): {"idx":0,"tc":1,"tl":0,"hash":0,"data":""} is ~50 chars.
919
+ # Max length of meta JSON (large numbers, empty data): {"idx":10000,"tc":10000,"tl":10000000000,"hash":4294967295,"data":""} is ~80 chars.
920
+ # Total maximum overhead estimate: 24 (tags) + 80 (meta) = 104 bytes.
921
+ # Let's use 120 bytes as a safe, generous overhead.
922
+ MAX_OVERHEAD_PER_CHUNK_BYTES = 120
923
+
924
+ effective_payload_bytes_per_chunk = MAX_QR_CODE_BYTE_CAPACITY - MAX_OVERHEAD_PER_CHUNK_BYTES
925
+
926
+ if effective_payload_bytes_per_chunk <= 0:
927
+ logger.error(f"Max QR size ({MAX_QR_CODE_BYTE_CAPACITY}) is too small for metadata overhead ({MAX_OVERHEAD_PER_CHUNK_BYTES}). Cannot chunk.")
928
  return []
929
 
930
+ # Calculate number of chunks based on the original data's byte length
931
+ num_chunks = math.ceil(total_bytes_length / effective_payload_bytes_per_chunk)
932
+ if num_chunks == 0: # Handle empty input data
933
+ return []
934
 
935
+ chunks_for_qr: List[str] = []
936
+ current_byte_pos = 0
 
 
 
 
 
 
937
 
 
 
 
938
  for i in range(num_chunks):
939
+ # Determine the slice of the original JSON bytes
940
+ end_byte_pos = min(current_byte_pos + effective_payload_bytes_per_chunk, total_bytes_length)
941
+ chunk_data_bytes = json_bytes[current_byte_pos:end_byte_pos]
942
+ chunk_data_str = chunk_data_bytes.decode('utf-8', errors='replace')
943
+
944
+ # Create the inner JSON structure for the chunk
945
+ chunk_dict = {
946
+ "idx": i + 1, # 1-based indexing for user readability
947
  "tc": num_chunks,
948
+ "tl": total_bytes_length, # Total length in bytes
949
+ "hash": hash(chunk_data_bytes) & 0xFFFFFFFF, # Hash of the byte slice
950
  "data": chunk_data_str
951
  }
952
+ inner_json_string = json.dumps(chunk_dict, ensure_ascii=False, separators=(',', ':'))
953
+
954
+ # Prepend {startN} and append {endN} tags
955
+ # Ensure N is fixed to the sequence number for rejoining.
956
+ final_qr_string = f"{{start{i+1}}}{inner_json_string}{{end{i+1}}}"
957
+
958
+ # Double check if the final_qr_string actually fits.
959
+ # This is a critical check for robustness, but might slow down if data is very large.
960
+ # For now, rely on our calculation based on fixed max capacity.
961
+ # If final_qr_string.encode('utf-8') > MAX_QR_CODE_BYTE_CAPACITY, then our estimates are off.
962
+ if len(final_qr_string.encode('utf-8')) > MAX_QR_CODE_BYTE_CAPACITY:
963
+ logger.warning(f"Chunk {i+1} exceeds estimated QR capacity. Calculated: {len(final_qr_string.encode('utf-8'))} bytes, Max: {MAX_QR_CODE_BYTE_CAPACITY} bytes. Adjusting MAX_OVERHEAD_PER_CHUNK_BYTES might be needed.")
964
+ # As a fallback, we can try to reduce the chunk_data_str length,
965
+ # but this means recalculating. For now, log warning and continue.
966
+ # A more robust solution might dynamically adjust effective_payload_bytes_per_chunk if this happens.
967
+
968
+ chunks_for_qr.append(final_qr_string)
969
+ current_byte_pos = end_byte_pos
970
+
971
+ if current_byte_pos < total_bytes_length:
972
+ logger.error(f"Chunking logic error: Only processed {current_byte_pos} of {total_bytes_length} bytes.")
973
  return []
974
 
975
+ logger.info(f"Chunked data into {num_chunks} chunks for QR codes, with positional sequencing tags.")
976
+ return chunks_for_qr
977
 
978
  except Exception as e:
979
  logger.error(f"Error chunking data: {e}")
 
988
  """Generate a stylish QR code with enhanced visual appeal"""
989
  try:
990
  qr = qrcode.QRCode(
991
+ version=None, # Let the library determine the best version for the data
992
+ error_correction=qrcode.constants.ERROR_CORRECT_M, # High error correction
993
  box_size=size,
994
  border=border
995
  )
996
 
997
+ # `data` here is expected to be the pre-formatted string from chunk_data,
998
+ # including the {startN} and {endN} tags, and the inner JSON.
999
  if isinstance(data, dict):
1000
+ # This path should ideally not be taken if chunk_data always returns strings
1001
  qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
1002
  else:
1003
  qr.add_data(str(data))
 
1028
 
1029
  def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
1030
  """Generate QR codes with enhanced visual appeal and metadata"""
1031
+ # The `data` here should be the full processed dataset (List[Dict]) or a single item (Dict/str).
1032
+ # The chunk_data method will handle turning this into strings suitable for QR codes.
1033
+
1034
+ if not isinstance(data, (list, dict, str)):
1035
+ logger.error("generate_qr_codes received data that is not a list, dict, or string.")
1036
  return []
1037
 
1038
  try:
 
1040
  paths = []
1041
 
1042
  if combined:
1043
+ # When combined, we treat the entire `data` (which should be List[Dict]) as one large string
1044
+ # to be chunked across multiple QRs.
1045
+ chunks_of_combined_data = file_processor.chunk_data(data)
1046
+ if not chunks_of_combined_data:
1047
  logger.warning("No chunks generated for combined data.")
1048
  return []
1049
+ for i, chunk_str in enumerate(chunks_of_combined_data):
1050
+ # The filename now includes the chunk number within the sequence
1051
+ # and total number of chunks.
1052
+ filename = f'combined_qr_{i+1}_of_{len(chunks_of_combined_data)}_{int(time.time())}.png'
1053
  qr_path = generate_stylish_qr(
1054
+ data=chunk_str, # This `chunk_str` already contains the {startN} and {endN} tags
1055
  filename=filename,
1056
  fill_color="#1a365d",
1057
  back_color="#ffffff"
 
1059
  if qr_path:
1060
  paths.append(qr_path)
1061
  else:
1062
+ logger.warning(f"Failed to generate QR for combined chunk {i+1}/{len(chunks_of_combined_data)}.")
1063
  else:
1064
+ # If not combined, each top-level item in the data list is processed individually.
1065
+ # Each individual item might itself be chunked into multiple QRs.
1066
+ if isinstance(data, list):
1067
  for idx, item in enumerate(data):
1068
+ item_chunks = file_processor.chunk_data(item) # Chunk each item
1069
+ if not item_chunks:
1070
  logger.warning(f"No chunks generated for item {idx+1}.")
1071
  continue
1072
+ for chunk_idx, chunk_str in enumerate(item_chunks):
1073
+ filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(item_chunks)}_{int(time.time())}.png'
1074
  qr_path = generate_stylish_qr(
1075
+ data=chunk_str, # This `chunk_str` already contains the {startN} and {endN} tags
1076
  filename=filename,
1077
  fill_color="#1a365d",
1078
  back_color="#ffffff"
 
1080
  if qr_path:
1081
  paths.append(qr_path)
1082
  else:
1083
+ logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(item_chunks)}.")
1084
+ elif isinstance(data, (dict, str)): # Handle single dict/string inputs if not a list
1085
+ single_item_chunks = file_processor.chunk_data(data)
1086
+ if not single_item_chunks:
1087
+ logger.warning("No chunks generated for single item.")
1088
+ return []
1089
+ for chunk_idx, chunk_str in enumerate(single_item_chunks):
1090
+ filename = f'single_item_chunk_{chunk_idx+1}_of_{len(single_item_chunks)}_{int(time.time())}.png'
1091
+ qr_path = generate_stylish_qr(
1092
+ data=chunk_str,
1093
+ filename=filename,
1094
+ fill_color="#1a365d",
1095
+ back_color="#ffffff"
1096
+ )
1097
+ if qr_path:
1098
+ paths.append(qr_path)
1099
+ else:
1100
+ logger.warning(f"Failed to generate QR for single item chunk {chunk_idx+1}/{len(single_item_chunks)}.")
1101
  else:
1102
+ logger.warning("Data is not a list, dict, or string and cannot be processed individually.")
1103
 
1104
  logger.info(f"Generated {len(paths)} QR codes.")
1105
  return paths
 
1110
 
1111
  # --- Chatbot Logic ---
1112
  def respond_to_chat(
1113
+ message: str,
1114
+ chat_history: List[Tuple[str, str]],
1115
  chatbot_data: Optional[List[Dict]],
1116
  # Add current_filtered_df_state as input, it will be updated and returned
1117
+ current_filtered_df_state: Optional[pd.DataFrame]
1118
  ) -> Tuple[List[Tuple[str, str]], List[Dict], Optional[pd.DataFrame]]:
1119
  """
1120
  Responds to user chat messages based on the loaded JSON data.
 
1129
  response = ""
1130
  lower_message = message.lower().strip()
1131
  # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it
1132
+ new_filtered_df_state = current_filtered_df_state
1133
 
1134
  try:
1135
  # Attempt to flatten the data structure for easier querying
 
1274
  new_filtered_df_state = None # Clear previous filter if column not found
1275
  else:
1276
  # NOTE: the filter is applied to the current filtered state when one exists and is non-empty; otherwise to the full dataframe 'df'
1277
+ active_df_to_filter = df.copy()
1278
  try:
1279
  # Attempt to infer value type for comparison
1280
  target_value: Any
1281
  col_dtype = df[column_name].dtype
1282
 
1283
+ # Check if current_filtered_df_state exists and is not empty, use it for filtering
1284
+ # Otherwise, use the full df
1285
+ df_to_filter = current_filtered_df_state if current_filtered_df_state is not None and not current_filtered_df_state.empty else df.copy()
1286
+
1287
  if pd.api.types.is_numeric_dtype(col_dtype) and operator in ['>', '>=', '<', '<=', '==', '!=']:
1288
  try:
1289
  target_value = float(value_str)
1290
+ col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
1291
  except ValueError:
1292
  response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
1293
  target_value = None # Error case
1294
  elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
1295
  target_value = value_str.lower() == 'true'
1296
+ col_series = df_to_filter[column_name].astype(bool, errors='ignore')
1297
  else: # Assume string comparison otherwise
1298
  target_value = str(value_str)
1299
+ col_series = df_to_filter[column_name].astype(str).str.lower() # Case-insensitive for strings
1300
  value_str_lower = target_value.lower()
1301
 
1302
 
 
1334
 
1335
  if condition is not None:
1336
  # Apply condition to df_to_filter (the current filtered state when non-empty, otherwise a copy of the full df)
1337
+ filtered_results_df = df_to_filter[condition] # Use df_to_filter here
1338
  if not filtered_results_df.empty:
1339
  new_filtered_df_state = filtered_results_df # Update state with new filter result
1340
  num_results = len(filtered_results_df)
1341
  preview_rows = min(num_results, 5)
1342
  preview_cols = min(len(filtered_results_df.columns), 5)
1343
+
1344
  preview_df = filtered_results_df.head(preview_rows).iloc[:, :preview_cols]
1345
  preview_str = preview_df.to_string(index=False)
1346
+
1347
  response = (f"Found {num_results} items where '{column_name}' {operator} '{value_str}'.\n"
1348
  f"Here's a preview:\n```\n{preview_str}\n```\n"
1349
  f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
 
1365
  response = f"An error occurred while applying the filter: {e}"
1366
  logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}")
1367
  # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results)
1368
+
1369
  # --- End of Enhanced Filter Logic ---
1370
 
1371
  # If `response` is still empty, it means no filter query was matched by the filter_match regex.
 
1730
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'
1731
 
1732
  if enabled_states is None or len(enabled_states) != num_qr_codes:
1733
+ # If states are not yet initialized or out of sync, enable all by default
1734
  enabled_states = list(range(num_qr_codes))
1735
 
1736
  for i, path in enumerate(paths):
 
1895
  try:
1896
  data_list = data_df.to_dict(orient='records')
1897
  json_str = json.dumps(data_list, indent=2, ensure_ascii=False)
1898
+
1899
  timestamp = int(time.time())
1900
  filename = f"{filename_prefix}_{timestamp}.json"
1901
  file_path = TEMP_DIR / filename
1902
+
1903
  with open(file_path, 'w', encoding='utf-8') as f:
1904
  f.write(json_str)
1905
+
1906
  logger.info(f"Successfully created JSON file for download: {file_path}")
1907
  return str(file_path)
1908
  except Exception as e:
 
1914
  logger.info("No full data available to download.")
1915
  # Optionally, could return a gr.Warning or gr.Info to the UI if we had a dedicated status Textbox for downloads
1916
  return None
1917
+
1918
  # The chatbot_data state is a list of dicts. Convert to DataFrame for download_json_data.
1919
  # The df created in respond_to_chat is not directly used here to ensure we get the *original* full data.
1920
  try:
 
1930
  except Exception as e:
1931
  logger.error(f"Error converting full chatbot_data to DataFrame for download: {e}")
1932
  return None
1933
+
1934
  return download_json_data(df_to_download, "full_data")
1935
 
1936
  def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]:
 
1959
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)**
1960
  - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification.
1961
  - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item.
1962
+ - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data, **including positional sequencing tags `{startN}` and `{endN}` in the QR code content**.
1963
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1964
  - **Modern Design**: Clean, responsive interface with visual feedback.
1965
  - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information.
 
1997
  raise
1998
 
1999
  if __name__ == "__main__":
2000
+ main()