# web_indexer_universal_v7.py
# SIMPLIFIED VERSION: synonym handling has been removed entirely.
# Supports Elastic Cloud, with secure configuration handling.
import os
import time
import traceback
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from elasticsearch import Elasticsearch, helpers, exceptions as es_exceptions
import sys
import warnings
from dotenv import load_dotenv

# === ANSI color codes (for console logging) ===
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
RESET = '\033[0m'
BLUE = '\033[94m'
CYAN = '\033[96m'
MAGENTA = '\033[95m'

# --- Load configuration from environment variables ---
load_dotenv()
CONFIG = {
    # --- Base settings (can be overridden via a .env file) ---
    "START_URL": os.getenv("START_URL", "https://www.dunaelektronika.com/"),
    "MAX_DEPTH": int(os.getenv("MAX_DEPTH", 2)),
    "REQUEST_DELAY": int(os.getenv("REQUEST_DELAY", 1)),
    "USER_AGENT": os.getenv("USER_AGENT", "MyPythonCrawler/1.0 (+http://example.com/botinfo)"),
    "VECTOR_INDEX_NAME": os.getenv("VECTOR_INDEX_NAME", "dunawebindexai"),
    "BATCH_SIZE": int(os.getenv("BATCH_SIZE", 50)),
    "ES_CLIENT_TIMEOUT": int(os.getenv("ES_CLIENT_TIMEOUT", 120)),
    "EMBEDDING_MODEL_NAME": 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    "CHUNK_SIZE_TOKENS": int(os.getenv("CHUNK_SIZE_TOKENS", 500)),
    "CHUNK_OVERLAP_TOKENS": int(os.getenv("CHUNK_OVERLAP_TOKENS", 50)),
    "MIN_CHUNK_SIZE_CHARS": int(os.getenv("MIN_CHUNK_SIZE_CHARS", 50)),
    "LLM_MODEL_NAME": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    "LLM_CHUNK_MODEL": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "DEBUG_MODE": os.getenv("DEBUG_MODE", "True").lower() == 'true',
    # --- Required, sensitive credentials ---
    "ES_CLOUD_ID": os.getenv("ES_CLOUD_ID"),
    "ES_API_KEY": os.getenv("ES_API_KEY"),
    "TOGETHER_API_KEY": os.getenv("TOGETHER_API_KEY")
}
CONFIG["TARGET_DOMAIN"] = urlparse(CONFIG["START_URL"]).netloc
embedding_model = None
EMBEDDING_DIM = None
device = 'cpu'
together_client = None
# --- Check and import LLM and other optional libraries ---
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print(f"{RED}FIGYELEM: Torch nincs telepítve.{RESET}")
try:
    import together
    if not CONFIG["TOGETHER_API_KEY"]:
        print(f"{RED}Hiba: TOGETHER_API_KEY nincs beállítva.{RESET}")
    else:
        together_client = together.Together(api_key=CONFIG["TOGETHER_API_KEY"])
        print(f"{GREEN}Together AI kliens inicializálva.{RESET}")
except ImportError:
    print(f"{YELLOW}Figyelem: together könyvtár nincs telepítve.{RESET}")
    together_client = None
except Exception as e:
    print(f"{RED}Hiba LLM backend inicializálásakor: {e}{RESET}")
    together_client = None
try:
    import tiktoken
    tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False
    print(f"{YELLOW}Figyelem: tiktoken nincs telepítve.{RESET}")
try:
    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print(f"{CYAN}NLTK 'punkt' letöltése...{RESET}")
        nltk.download('punkt', quiet=True)
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    print(f"{RED}HIBA: 'nltk' nincs telepítve!{RESET}")
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMER_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMER_AVAILABLE = False
    print(f"{RED}HIBA: 'sentence-transformers' nincs telepítve!{RESET}")
try:
    # Force UTF-8 console output so Hungarian characters print correctly
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')
except AttributeError:
    pass
# --- LLM BACKGROUND FUNCTIONS ---
def generate_categories_with_llm(llm_client, soup, text):
    """Pick one category for the page: breadcrumb/H1 heuristics first, then the LLM; falls back to 'egyéb'."""
    category_list = ['IT biztonsági szolgáltatások', 'szolgáltatások', 'hardver', 'szoftver', 'hírek', 'audiovizuális konferenciatechnika']
    try:
        breadcrumb = soup.find('nav', class_='breadcrumb')
        if breadcrumb:
            categories = [li.get_text(strip=True) for li in breadcrumb.find_all('li')]
            if categories:
                final_category_from_html = categories[-1]
                for cat in category_list:
                    if cat.lower() in final_category_from_html.lower():
                        return [cat]
    except Exception:
        pass
    try:
        h1_tag = soup.find('h1')
        if h1_tag and h1_tag.get_text(strip=True):
            h1_text = h1_tag.get_text(strip=True)
            for cat in category_list:
                if cat.lower() in h1_text.lower():
                    return [cat]
    except Exception:
        pass
    if not llm_client:
        return ['egyéb']
    try:
        categories_text = ", ".join([f"'{cat}'" for cat in category_list])
        prompt = f"""Adott egy weboldal szövege. Adj meg egyetlen, rövid kategóriát a következő listából, ami a legjobban jellemzi a tartalmát. A válaszodban csak a kategória szerepeljen, más szöveg nélkül.
Lehetséges kategóriák: {categories_text}
Szöveg: {text[:1000]}
Kategória:"""
        response = llm_client.chat.completions.create(model=CONFIG["LLM_CHUNK_MODEL"], messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=30)
        if response and response.choices:
            category = response.choices[0].message.content.strip().replace("'", "").replace("`", "")
            for cat in category_list:
                if cat.lower() in category.lower():
                    return [cat]
    except Exception as e:
        print(f"{RED}Hiba LLM kategorizáláskor: {e}{RESET}")
    return ['egyéb']
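# Illustrative example (hypothetical page): if the last breadcrumb item is
# "Szerverek és hardverek", the substring match on 'hardver' returns ['hardver']
# without ever calling the LLM; only pages with no heuristic hit reach the API.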
def generate_summary_with_llm(llm_client, text):
    """Ask the LLM for a short summary of the page text; falls back to the first 300 characters."""
    if not llm_client:
        return text[:300] + "..."
    try:
        prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről. A lényeges pontokat emeld ki, de ne lépd túl a 200 szó terjedelmet.
Szöveg: {text}
Összefoglalás:"""
        response = llm_client.chat.completions.create(model=CONFIG["LLM_CHUNK_MODEL"], messages=[{"role": "user", "content": prompt}], temperature=0.5, max_tokens=500)
        if response and response.choices:
            return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"{RED}Hiba LLM összefoglaláskor: {e}{RESET}")
    return text[:300] + "..."
def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
    """Split text into overlapping ~chunk_size-token chunks (character-based fallback if tiktoken/nltk are missing)."""
    if not TIKTOKEN_AVAILABLE or not NLTK_AVAILABLE:
        chunks = []
        start = 0
        while start < len(text):
            chunks.append(text[start:start + chunk_size])
            start += chunk_size - chunk_overlap
        return chunks
    tokens = tiktoken_encoder.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tiktoken_encoder.decode(tokens[start:start + chunk_size]))
        start += chunk_size - chunk_overlap
    return chunks
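# Illustrative example (assuming tiktoken is available): with CHUNK_SIZE_TOKENS=500
# and CHUNK_OVERLAP_TOKENS=50, a 1200-token page yields chunks covering tokens
# 0-500, 450-950 and 900-1200, so consecutive chunks share about 50 tokens of context.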
# --- Model and Device Initialization ---
def load_embedding_model():
    """Load the SentenceTransformer embedding model once and cache it in module globals."""
    global embedding_model, EMBEDDING_DIM, device
    if not TORCH_AVAILABLE or not SENTENCE_TRANSFORMER_AVAILABLE:
        EMBEDDING_DIM = 768
        device = 'cpu'
        return None, EMBEDDING_DIM, device
    if embedding_model and EMBEDDING_DIM:
        return embedding_model, EMBEDDING_DIM, device
    print(f"\n'{CONFIG['EMBEDDING_MODEL_NAME']}' modell betöltése...")
    try:
        current_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = SentenceTransformer(CONFIG['EMBEDDING_MODEL_NAME'], device=current_device)
        print(f"ST modell betöltve, eszköz: {model.device}")
        dim = model.get_sentence_embedding_dimension()
        if not dim:
            raise ValueError("Dim error")
        embedding_model = model
        EMBEDDING_DIM = dim
        device = current_device
        return embedding_model, EMBEDDING_DIM, device
    except Exception as e:
        print(f"{RED}Hiba embedding modell betöltésekor: {e}{RESET}")
        traceback.print_exc()
        embedding_model = None
        EMBEDDING_DIM = 768
        device = 'cpu'
        return None, EMBEDDING_DIM, device

embedding_model, EMBEDDING_DIM, device = load_embedding_model()
# === Index settings & mappings (no synonyms) ===
INDEX_SETTINGS = {
    "analysis": {
        "filter": {
            "hungarian_stop": {"type": "stop", "stopwords": "_hungarian_"},
            "hungarian_stemmer": {"type": "stemmer", "language": "hungarian"}
        },
        "analyzer": {
            "hungarian_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase", "hungarian_stop", "hungarian_stemmer"]
            }
        }
    }
}
INDEX_MAPPINGS_WEB = {
    "properties": {
        "text_content": {"type": "text", "analyzer": "hungarian_analyzer"},
        "embedding": {"type": "dense_vector", "dims": EMBEDDING_DIM, "index": True, "similarity": "cosine"},
        "source_origin": {"type": "keyword"},
        "source_url": {"type": "keyword"},
        "source_type": {"type": "keyword"},
        "category": {"type": "keyword"},
        "heading": {"type": "text", "analyzer": "hungarian_analyzer"},
        "summary": {"type": "text", "analyzer": "hungarian_analyzer"}
    }
}
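# For illustration only (not used by the crawler): documents indexed with this
# mapping can later be retrieved with a hybrid kNN + full-text search, roughly as
# in the sketch below (assumes elasticsearch-py 8.x and a precomputed query_vector
# from the same embedding model):
#
#   es_client.search(
#       index=CONFIG["VECTOR_INDEX_NAME"],
#       knn={"field": "embedding", "query_vector": query_vector,
#            "k": 10, "num_candidates": 100},
#       query={"match": {"text_content": "szerver üzemeltetés"}},
#       size=10,
#   )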
# --- Helper functions ---
def initialize_es_client():
    """Create an Elasticsearch client for Elastic Cloud from ES_CLOUD_ID / ES_API_KEY; returns None on failure."""
    if not CONFIG["ES_CLOUD_ID"] or not CONFIG["ES_API_KEY"]:
        print(f"{RED}Hiba: Az ES_CLOUD_ID és ES_API_KEY környezeti változók beállítása kötelező!{RESET}")
        return None
    try:
        if CONFIG["DEBUG_MODE"]:
            print("\nKapcsolódás az Elasticsearch-hez (Cloud ID)...")
        client = Elasticsearch(
            cloud_id=CONFIG["ES_CLOUD_ID"],
            api_key=CONFIG["ES_API_KEY"],
            request_timeout=CONFIG["ES_CLIENT_TIMEOUT"]
        )
        if client.ping():
            if CONFIG["DEBUG_MODE"]:
                print(f"{GREEN}Sikeres Elastic Cloud kapcsolat!{RESET}")
            return client
    except Exception as e:
        print(f"{RED}Hiba az Elastic Cloud kapcsolat során: {e}{RESET}")
    # Ping failed or an exception was raised; signal failure to the caller.
    return None
def get_embedding(text):
    """Return the normalized embedding vector for a text, or None on failure."""
    if not embedding_model or not text or not isinstance(text, str):
        return None
    try:
        return embedding_model.encode(text, normalize_embeddings=True).tolist()
    except Exception as e:
        print(f"{RED}Hiba embedding közben: {e}{RESET}")
        return None
def create_es_index(client, index_name, index_settings, index_mappings):
    if not EMBEDDING_DIM:
        print(f"{RED}Hiba: Embedding dimenzió nincs beállítva.{RESET}")
        return False
    try:
        index_mappings["properties"]["embedding"]["dims"] = EMBEDDING_DIM
    except KeyError:
        print(f"{RED}Hiba: Érvénytelen mapping struktúra.{RESET}")
        return False
    try:
        if not client.indices.exists(index=index_name):
            print(f"'{index_name}' index létrehozása...")
            client.indices.create(index=index_name, settings=index_settings, mappings=index_mappings)
            print(f"{GREEN}Index sikeresen létrehozva.{RESET}")
            time.sleep(2)
        else:
            if CONFIG["DEBUG_MODE"]:
                print(f"Index '{index_name}' már létezik.")
        return True
    except Exception as e:
        print(f"{RED}Hiba az index létrehozása során: {e}{RESET}")
        traceback.print_exc()
        return False
def extract_text_from_html(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
            if element:
                element.decompose()
        main_content = soup.find('main') or soup.find('article') or soup.body
        if main_content:
            return "\n".join(line for line in main_content.get_text(separator='\n', strip=True).splitlines() if line.strip())
    except Exception as e:
        print(f"{RED}Hiba a HTML szöveg kinyerése során: {e}{RESET}")
    return ""
def extract_and_filter_links(soup, base_url, target_domain):
    links = set()
    try:
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if href and not href.startswith(('#', 'mailto:', 'javascript:')):
                full_url = urljoin(base_url, href)
                parsed_url = urlparse(full_url)
                if parsed_url.scheme in ['http', 'https'] and parsed_url.netloc == target_domain:
                    links.add(parsed_url._replace(fragment="").geturl())
    except Exception as e:
        print(f"{RED}Hiba a linkek kinyerése során: {e}{RESET}")
    return links
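# Illustrative behaviour (hypothetical hrefs): a relative link '/szolgaltatasok' found on
# https://www.dunaelektronika.com/ is resolved with urljoin and kept because its netloc
# matches the target domain, while an absolute link to another site is filtered out and
# any '#fragment' is stripped before the URL is queued.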
def crawl_and_index_website(start_url, max_depth, es_client, index_name):
    """Breadth-first crawl of the target domain: download pages, chunk, embed and bulk-index them."""
    if not es_client or not embedding_model:
        return 0
    visited_urls, urls_to_visit = set(), deque([(start_url, 0)])
    bulk_actions = []
    total_prepared, total_indexed = 0, 0
    target_domain = urlparse(start_url).netloc
    print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")
    while urls_to_visit:
        current_url = None
        try:
            current_url, current_depth = urls_to_visit.popleft()
            if current_url in visited_urls or current_depth > max_depth:
                continue
            print(f"\n--- Feldolgozás (Mélység: {current_depth}): {current_url} ---")
            visited_urls.add(current_url)
            try:
                headers = {'User-Agent': CONFIG["USER_AGENT"]}
                response = requests.get(current_url, headers=headers, timeout=15)
                response.raise_for_status()
                if 'text/html' not in response.headers.get('content-type', '').lower():
                    print(f" {YELLOW}-> Nem HTML tartalom, kihagyva.{RESET}")
                    continue
                html_content = response.content
            except requests.exceptions.RequestException as req_err:
                print(f" {RED}!!! Hiba a letöltés során: {req_err}{RESET}")
                continue
            soup = BeautifulSoup(html_content, 'html.parser')
            page_text = extract_text_from_html(html_content)
            if not page_text or len(page_text) < CONFIG["MIN_CHUNK_SIZE_CHARS"]:
                print(f" {YELLOW}-> Túl rövid szöveg, kihagyva.{RESET}")
                continue
            final_chunks = chunk_text_by_tokens(page_text, CONFIG["CHUNK_SIZE_TOKENS"], CONFIG["CHUNK_OVERLAP_TOKENS"])
            url_category = generate_categories_with_llm(together_client, soup, page_text)[0]
            page_summary = generate_summary_with_llm(together_client, page_text)
            if not final_chunks:
                continue
            for chunk_text in final_chunks:
                element_vector = get_embedding(chunk_text)
                if element_vector:
                    total_prepared += 1
                    doc = {"text_content": chunk_text, "embedding": element_vector, "source_origin": "website", "source_url": current_url, "source_type": "token_chunking", "category": url_category, "summary": page_summary}
                    bulk_actions.append({"_index": index_name, "_source": doc})
                    if len(bulk_actions) >= CONFIG["BATCH_SIZE"]:
                        success_count, errors = helpers.bulk(es_client, bulk_actions, raise_on_error=False, request_timeout=CONFIG["ES_CLIENT_TIMEOUT"])
                        total_indexed += success_count
                        bulk_actions = []
                        if errors:
                            print(f"{RED}!!! Hiba a bulk indexelés során: {len(errors)} sikertelen.{RESET}")
            if current_depth < max_depth:
                new_links = extract_and_filter_links(soup, current_url, target_domain)
                for link in new_links:
                    if link not in visited_urls:
                        urls_to_visit.append((link, current_depth + 1))
            time.sleep(CONFIG['REQUEST_DELAY'])
        except KeyboardInterrupt:
            print("\nFolyamat megszakítva.")
            break
        except Exception as loop_err:
            print(f"{RED}!!! Hiba a ciklusban ({current_url}): {loop_err}{RESET}")
            traceback.print_exc()
            time.sleep(5)
    if bulk_actions:
        success_count, errors = helpers.bulk(es_client, bulk_actions, raise_on_error=False, request_timeout=CONFIG["ES_CLIENT_TIMEOUT"])
        total_indexed += success_count
        if errors:
            print(f"{RED}!!! Hiba a maradék indexelése során: {len(errors)} sikertelen.{RESET}")
    print(f"\n--- Web Crawling Befejezve ---")
    print(f"Meglátogatott URL-ek: {len(visited_urls)}")
    print(f"Előkészített chunk-ok: {total_prepared}")
    print(f"Sikeresen indexelt chunk-ok: {total_indexed}")
    return total_indexed
# --- Main execution block ---
if __name__ == "__main__":
    print(f"----- Web Crawler és Indexelő Indítása a '{CONFIG['VECTOR_INDEX_NAME']}' indexbe -----")
    print(f"----- Cél URL: {CONFIG['START_URL']} (Max mélység: {CONFIG['MAX_DEPTH']}) -----")
    print("****** FIGYELEM ******")
    print(f"Ez a script létrehozza/használja a '{CONFIG['VECTOR_INDEX_NAME']}' indexet.")
    print(f"{RED}Ha a '{CONFIG['VECTOR_INDEX_NAME']}' index már létezik, TÖRÖLD manuálisan futtatás előtt!{RESET}")
    print("********************")
    if not all([TORCH_AVAILABLE, SENTENCE_TRANSFORMER_AVAILABLE, embedding_model, EMBEDDING_DIM]):
        print(f"{RED}Hiba: AI modellek hiányoznak. Leállás.{RESET}")
        sys.exit(1)
    if not CONFIG["TOGETHER_API_KEY"]:
        print(f"{RED}Hiba: TOGETHER_API_KEY hiányzik. Leállás.{RESET}")
        sys.exit(1)
    es_client = initialize_es_client()
    if not es_client:
        print(f"{RED}Hiba: Elasticsearch kliens inicializálása sikertelen. Leállás.{RESET}")
        sys.exit(1)
    final_success_count = 0
    index_ready = create_es_index(
        client=es_client,
        index_name=CONFIG["VECTOR_INDEX_NAME"],
        index_settings=INDEX_SETTINGS,
        index_mappings=INDEX_MAPPINGS_WEB
    )
    if index_ready:
        print(f"\nIndex '{CONFIG['VECTOR_INDEX_NAME']}' kész. Crawling indítása...")
        final_success_count = crawl_and_index_website(
            start_url=CONFIG["START_URL"],
            max_depth=CONFIG["MAX_DEPTH"],
            es_client=es_client,
            index_name=CONFIG["VECTOR_INDEX_NAME"]
        )
    else:
        print(f"{RED}Hiba: Index létrehozása sikertelen. Leállás.{RESET}")
    print("\n----- Feldolgozás Befejezve -----")
    if index_ready and final_success_count > 0:
        print(f"\n{GREEN}Sikeres. {final_success_count} chunk indexelve '{CONFIG['VECTOR_INDEX_NAME']}'-be.{RESET}")
    elif index_ready and final_success_count == 0:
        print(f"{YELLOW}Crawling lefutott, de 0 chunk lett indexelve.{RESET}")
    else:
        print(f"{RED}A folyamat hibával zárult.{RESET}")