"""
RAGDocumentManager - business logic for managing RAG documents.
"""

import os
from typing import List, Optional
from dataclasses import dataclass
from datetime import datetime

# Per-profile embeddings cache (dict keyed by profile_name).
# Keeps profiles isolated and leaves room for a different model per profile.
_EMBEDDINGS_CACHE = {}


@dataclass
class DocumentInfo:
    """Summary of one indexed RAG document."""
    filename: str        # e.g. "meditazioni.pdf" (hash prefix stripped)
    title: str           # taken from the chunk metadata ('title' key)
    author: str          # taken from the chunk metadata ('author' key)
    chunk_count: int     # number of chunks stored in ChromaDB for this source
    file_size: int       # bytes (0 when the source file is missing)
    indexed_date: float  # Unix timestamp: mtime of the source file (0 if missing)
    source_path: str     # full path as stored in metadata (includes the hash)


class RAGDocumentManager:
    """CRUD-style operations on the RAG documents of a single profile."""

    def __init__(self, profile_name: str, project_root: str):
        """
        Compute the per-profile storage paths; nothing is opened or loaded here.

        Args:
            profile_name: Profile whose knowledge base is managed.
            project_root: Root directory that contains ``data/agents/<profile>``.
        """
        self.profile_name = profile_name
        self.project_root = project_root

        # Paths: everything lives under data/agents/<profile_name>/
        agent_dir = os.path.join(project_root, 'data', 'agents', profile_name)
        self.chroma_path = os.path.join(agent_dir, 'chroma_db')
        self.docstore_path = os.path.join(agent_dir, 'doc_store')
        self.source_docs_path = os.path.join(agent_dir, 'source_docs')

        # Embeddings are lazy-loaded on first use.
        self._embeddings = None

    def _get_embeddings(self):
        """
        Return the embeddings object for this profile, creating it on first use.

        The instance is shared through the module-level ``_EMBEDDINGS_CACHE``
        so that several managers for the same profile load the model once.

        Raises:
            ImportError: if ``langchain_huggingface`` is not installed.
        """
        # Check the cache for this specific profile first.
        if self.profile_name in _EMBEDDINGS_CACHE:
            return _EMBEDDINGS_CACHE[self.profile_name]

        # Otherwise build a new instance (only once per manager).
        if self._embeddings is None:
            try:
                # Lazy import: the module stays importable without RAG deps.
                from langchain_huggingface import HuggingFaceEmbeddings

                print(f"[RAGDocumentManager] Caricamento embeddings per profilo: {self.profile_name}")
                self._embeddings = HuggingFaceEmbeddings(
                    model_name="sentence-transformers/all-MiniLM-L6-v2"
                )
                # Publish in the per-profile cache.
                _EMBEDDINGS_CACHE[self.profile_name] = self._embeddings
            except ImportError as e:
                raise ImportError(
                    "Dipendenze RAG non installate. "
                    "Installa con: pip install langchain-huggingface sentence-transformers"
                ) from e

        return self._embeddings

    def _open_collection(self):
        """Open the profile's Chroma store and return its underlying collection."""
        # Lazy import to avoid crashing at import time if deps are missing.
        from langchain_chroma import Chroma

        vectorstore = Chroma(
            collection_name="split_parents",
            persist_directory=self.chroma_path,
            embedding_function=self._get_embeddings(),
        )
        # The raw collection is used for metadata-only reads and id deletes.
        return vectorstore._collection

    def list_documents(self) -> List[DocumentInfo]:
        """
        List every document indexed for the profile.

        Returns:
            DocumentInfo list sorted by date (most recent first).
            Empty list when there are no documents, the ChromaDB directory
            does not exist, or any error occurs (the error is printed).
        """
        if not os.path.exists(self.chroma_path):
            return []

        try:
            collection = self._open_collection()
            results = collection.get(include=['metadatas'])

            metadatas = results['metadatas']
            if not metadatas:
                return []

            # Single pass: remember the first metadata seen per source and
            # count its chunks, instead of re-querying the store per source.
            first_meta = {}
            chunk_counts = {}
            for meta in metadatas:
                if not meta:
                    # Chunk stored without metadata: cannot be attributed.
                    continue
                source_path = meta.get('source', '')
                if not source_path:
                    continue
                first_meta.setdefault(source_path, meta)
                chunk_counts[source_path] = chunk_counts.get(source_path, 0) + 1

            documents = []
            for source_path, meta in first_meta.items():
                # File info; a missing/unreadable source keeps the zero defaults.
                file_size = 0
                indexed_date = 0
                try:
                    stat_result = os.stat(source_path)
                except OSError:
                    pass
                else:
                    file_size = stat_result.st_size
                    indexed_date = stat_result.st_mtime

                # Strip the hash prefix (format: hash__filename.pdf).
                basename = os.path.basename(source_path)
                filename = basename.split('__', 1)[1] if '__' in basename else basename

                documents.append(DocumentInfo(
                    filename=filename,
                    title=meta.get('title', ''),
                    author=meta.get('author', ''),
                    chunk_count=chunk_counts[source_path],
                    file_size=file_size,
                    indexed_date=indexed_date,
                    source_path=source_path
                ))

            # Sort by date (most recent first).
            return sorted(documents, key=lambda d: d.indexed_date, reverse=True)

        except Exception as e:
            # Deliberate best-effort: callers get an empty listing on failure.
            print(f"[RAGDocumentManager] Errore list_documents: {e}")
            return []

    def delete_document(self, source_path: str) -> bool:
        """
        Delete a document and all of its chunks.

        Removes, in order:
            1. Vectors from ChromaDB whose metadata 'source' equals source_path
            2. Files in doc_store named after the removed chunk ids
            3. The original file at source_path

        Args:
            source_path: Source path exactly as stored in the chunk metadata.

        Returns:
            True on success, False if any step raised (the error is printed).
        """
        try:
            collection = self._open_collection()

            # 1. Delete vectors from ChromaDB.
            # Ids are always part of the result; 'ids' is not a valid
            # ``include`` field, so request no extra payload.
            results = collection.get(
                where={"source": {"$eq": source_path}},
                include=[]
            )
            chunk_ids = results['ids']

            if chunk_ids:
                collection.delete(ids=chunk_ids)
                print(f"[RAGDocumentManager] Deleted {len(chunk_ids)} vectors from ChromaDB")

            # 2. Delete parent chunks from doc_store.
            # NOTE(review): assumes doc_store files are named after the Chroma
            # chunk ids; if the store is keyed by a parent id kept in metadata
            # this loop removes nothing - verify against the indexer.
            deleted_chunks = 0
            for chunk_id in chunk_ids:
                doc_file = os.path.join(self.docstore_path, chunk_id)
                if os.path.exists(doc_file):
                    os.remove(doc_file)
                    deleted_chunks += 1

            print(f"[RAGDocumentManager] Deleted {deleted_chunks} doc_store files")

            # 3. Delete the original file.
            # NOTE(review): source_path is removed as given, without checking
            # that it lies under source_docs_path - confirm callers only pass
            # paths obtained from list_documents().
            if os.path.exists(source_path):
                os.remove(source_path)
                print(f"[RAGDocumentManager] Deleted source: {source_path}")

            return True

        except Exception as e:
            print(f"[RAGDocumentManager] Errore delete_document: {e}")
            return False

    def get_stats(self) -> dict:
        """
        Return knowledge-base statistics for the profile.

        Returns:
            Dict with 'document_count', 'total_chunks', 'total_size_bytes'
            and 'total_size_mb' (rounded to two decimals).
        """
        docs = self.list_documents()

        total_size = sum(d.file_size for d in docs)
        total_chunks = sum(d.chunk_count for d in docs)

        return {
            'document_count': len(docs),
            'total_chunks': total_chunks,
            'total_size_bytes': total_size,
            'total_size_mb': round(total_size / (1024 * 1024), 2)
        }