#!/usr/bin/env python3
"""Index a plain-text file into the 'aurelio' agent's parent-document store.

Usage: python indicizza_txt.py <file.txt>

The file is split into parent chunks and smaller child chunks; the child
chunks are embedded into a persistent Chroma collection and the parent chunks
are pickled into a local file store.
"""
import os
import sys

# Add the project root to the import path.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, project_root)

from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.storage import LocalFileStore, EncoderBackedStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle

# Load environment variables from data/.env when that file exists.
env_path = os.path.join(project_root, 'data', '.env')
if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)


def main(txt_file):
    """Index one UTF-8 text file into the vector store and the document store.

    Args:
        txt_file: Path of the text file to index. It is read as UTF-8.

    The function prints its progress to stdout and returns nothing. Errors
    raised by the loader, the embedding model or the stores are not caught
    here and propagate to the caller.
    """
    print(f"[INDICIZZATORE TXT] Indicizzazione: {os.path.basename(txt_file)}")

    # Both stores live under data/agents/aurelio inside the project root.
    agent_dir = os.path.join(project_root, 'data', 'agents', 'aurelio')
    vectorstore_path = os.path.join(agent_dir, 'chroma_db')
    docstore_path = os.path.join(agent_dir, 'doc_store')

    # 1. Load the TXT document.
    print("1/4: Caricamento file TXT...")
    loader = TextLoader(txt_file, encoding='utf-8')
    docs = loader.load()
    print(f" Caricato: {len(docs)} documento/i")

    # 2. Set up the embedding model.
    print("2/4: Inizializzazione embeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

    # 3. Set up the stores: Chroma for the embedded child chunks, a file store
    #    for the pickled parent chunks.
    print("3/4: Setup vectorstore...")
    vectorstore = Chroma(
        collection_name="split_parents",
        embedding_function=embeddings,
        persist_directory=vectorstore_path,
    )
    fs = LocalFileStore(docstore_path)
    # Keys are stored unchanged; values are serialized with pickle.
    # SECURITY NOTE: pickle.loads executes arbitrary code while deserializing,
    # so the doc_store directory must only ever contain files written by this
    # project. Do not point it at untrusted data.
    store = EncoderBackedStore(fs, lambda key: key, pickle.dumps, pickle.loads)

    # 4. Set up the retriever: 2000-character parents (200 overlap) are stored
    #    in the docstore, 400-character children (40 overlap) are embedded.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=40
    )
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )

    # 5. Index the documents (ids=None is passed explicitly, as before).
    print("4/4: Indicizzazione...")
    retriever.add_documents(docs, ids=None)
    print("✅ Indicizzazione TXT completata!")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # The usage line previously ended after the script name and did not
        # say which argument was expected.
        print("Uso: python indicizza_txt.py <file.txt>")
        sys.exit(1)

    input_path = sys.argv[1]
    # Fail with a clear message instead of a loader traceback when the path
    # is wrong. main() itself is left unchanged for programmatic callers.
    if not os.path.isfile(input_path):
        print(f"Errore: file non trovato: {input_path}", file=sys.stderr)
        sys.exit(1)

    main(input_path)