import json
import pandas as pd
import gradio as gr
from typing import Dict, Any, Type
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient, ModalRerankerClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline
from pydantic import BaseModel, Field, create_model
import os
import dotenv
import random
import numpy as np
import torch

dotenv.load_dotenv()


def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(22)


def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.

    Supports multiple input formats:
    1. JSON schema format
    2. Python class definition
    3. Simple field definitions
    """
    schema_input = schema_input.strip()

    if not schema_input:
        # Default schema if none provided
        return create_model('DefaultSchema',
                            title=(str, Field(description="Title of the content")),
                            content=(str, Field(description="Main content")))

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            schema_dict = json.loads(schema_input)
            return json_schema_to_basemodel(schema_dict)
        # Try parsing as Python class definition
        elif 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        # Try parsing as simple field definitions
        else:
            return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
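
# Illustrative usage (not executed here; the field names are placeholders, not part of the app):
#
#   parse_schema_input("title: str = Page title\nprice: float = Product price")
#   parse_schema_input('{"properties": {"title": {"type": "string"}}, "required": ["title"]}')
#   parse_schema_input("class Product(BaseModel):\n    title: str = Field(description='Title')")
#
# Each call returns a Pydantic model class that the pipeline can use as its target schema.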


def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert JSON schema to BaseModel"""
    fields = {}
    properties = schema_dict.get('properties', {})
    required = schema_dict.get('required', [])

    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            fields[field_name] = (field_type, Field(default=None, description=field_description))

    return create_model('DynamicSchema', **fields)
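
# Illustrative mapping (the "title"/"price" names are examples only): a property listed under
# "required" becomes a required model field, while any other property defaults to None, e.g.
#
#   M = json_schema_to_basemodel({"properties": {"title": {"type": "string"},
#                                                "price": {"type": "number"}},
#                                 "required": ["title"]})
#   # M(title="Widget") is valid; "price" is optional and defaults to None.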


def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel"""
    try:
        # Execute the class definition in a restricted namespace
        namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
                     'float': float, 'bool': bool, 'list': list, 'dict': dict}
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for name, obj in namespace.items():
            if (isinstance(obj, type) and
                    issubclass(obj, BaseModel) and
                    obj is not BaseModel):
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}")


def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Convert simple field definitions to BaseModel"""
    fields = {}

    for line in fields_text.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Parse field definition (e.g., "name: str = description")
        if ':' in line:
            parts = line.split(':', 1)
            field_name = parts[0].strip()
            type_and_desc = parts[1].strip()

            if '=' in type_and_desc:
                type_part, desc_part = type_and_desc.split('=', 1)
                field_type = get_python_type(type_part.strip())
                description = desc_part.strip().strip('"\'')
            else:
                field_type = get_python_type(type_and_desc.strip())
                description = ""

            fields[field_name] = (field_type, Field(description=description))
        else:
            # Simple field name only
            field_name = line.strip()
            fields[field_name] = (str, Field(description=""))

    if not fields:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **fields)


def get_python_type(type_str: str):
    """Convert type string to Python type"""
    type_str = type_str.lower().strip()
    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict
    }
    return type_mapping.get(type_str, str)
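
# Illustrative behaviour: get_python_type("number") -> float and get_python_type("Array") -> list,
# while any unrecognized type string (e.g. "datetime") falls back to str.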


def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Wrapper function that converts schema input to BaseModel"""
    try:
        # Parse the schema input into a BaseModel
        schema_model = parse_schema_input(schema_input)

        # Call the original function
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}
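
# Illustrative call (not executed here; the HTML snippet and field names are placeholders):
#
#   webpage_to_json_wrapper("<h1>Sample</h1><p>Price: $9.99</p>", False,
#                           "title: str = Page title\nprice: float = Product price")
#
# The wrapper builds the schema model first, then hands the raw HTML to the extraction pipeline.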


def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """
    Extracts structured JSON information from the given content based on a specified schema.

    This function sets up a processing pipeline that:
    - Preprocesses the input content.
    - Uses an AI language model to extract information according to the provided schema.
    - Postprocesses the extracted output to match the exact schema requirements.

    Parameters:
        content (str): The input content to be analyzed. This can be raw text or a URL.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (Type[BaseModel]): A Pydantic model class defining the expected structure and data types for the output.

    Returns:
        Dict[str, Any]: A dictionary containing the extracted data matching the schema. If initialization or
        processing fails, the dictionary includes an "error" key with a descriptive message.
    """
    prompt_template = """Extract the following information from the provided content according to the specified schema.

Content to analyze:
{content}

Schema requirements:
{schema}

Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema
- Adhere strictly to the schema; do not add, rename, or restructure fields
- If the schema asks for an array, extract multiple items when they are present in the content
"""

    classification_prompt_template = schema.model_json_schema()

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    preprocessor = BasicPreprocessor(config={'keep_tags': True})

    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'google/gemma-3n-e2b-it'})
        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-html-rerank.modal.run")
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template,
                                          classifier_prompt=classification_prompt_template)

    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        result = pipeline.run(content, is_url, schema)
        print("-" * 80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
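
# A possible follow-up to the TODO in webpage_to_json (sketch only, not wired in): construct the
# heavyweight clients once at import time and reuse them across requests, e.g.
#
#   _LLM = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),
#                                  'model_name': 'google/gemma-3n-e2b-it'})
#   _RERANKER = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-html-rerank.modal.run")
#
# and pass them into webpage_to_json instead of recreating them on every call.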


# Example schemas for the user
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
  "properties": {
    "title": {"type": "string", "description": "Page title"},
    "price": {"type": "number", "description": "Product price"},
    "description": {"type": "string", "description": "Product description"}
  },
  "required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""
| # Build Gradio Interface | |
| demo = gr.Interface( | |
| fn=webpage_to_json_wrapper, | |
| inputs=[ | |
| gr.Textbox( | |
| label="Content (URL or Raw Text)", | |
| lines=10, | |
| placeholder="Enter URL or paste raw HTML/text here." | |
| ), | |
| gr.Checkbox(label="Content is URL?", value=False), | |
| gr.Textbox( | |
| label="Schema Definition", | |
| lines=15, | |
| placeholder="Define your extraction schema (see examples below)", | |
| info=example_schemas | |
| ) | |
| ], | |
| outputs=gr.JSON(label="Output JSON"), | |
| title="Webpage to JSON Converter", | |
| description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.", | |
| examples=[ | |
| [ | |
| "https://example.com", | |
| True, | |
| "title: str = Page title\nprice: float = Product price\ndescription: str = Description" | |
| ], | |
| [ | |
| "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>", | |
| False, | |
| '''{ | |
| "type": "object", | |
| "properties": { | |
| "title": { | |
| "type": "string", | |
| "description": "Name of the product" | |
| }, | |
| "price": { | |
| "type": "number", | |
| "description": "Price of the product" | |
| }, | |
| "description": { | |
| "type": "string", | |
| "description": "Detailed description of the product" | |
| }, | |
| "availability": { | |
| "type": "boolean", | |
| "description": "Whether the product is in stock (true) or not (false)" | |
| } | |
| }, | |
| "required": ["title", "price"] | |
| }''' | |
| ] | |
| ] | |
| ) | |

if __name__ == "__main__":
    demo.launch(mcp_server=True)