"""Splitter module for extracting files from aggregated markdown file."""

import os
import re
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

from .exceptions import InvalidFileError, FileOperationError, ParseError


class Splitter:
    """Splits an aggregated file back into individual .md files.

    The aggregated file is read line by line. Every line that (after
    stripping surrounding whitespace) matches ``SEPARATOR_PATTERN`` starts a
    new section; the captured group is the section's relative file path and
    the following lines, up to the next separator, are its content. Lines
    that appear before the first separator are ignored.
    """

    SEPARATOR_PATTERN = r"^--- FILE: (.+) ---$"

    def __init__(self):
        """Initialize the Splitter."""

    def split(
        self,
        aggregated_file: str,
        output_dir: str,
        progress_callback: Optional[Callable[[int, str], None]] = None,
    ) -> Dict[str, Any]:
        """
        Split an aggregated file into individual .md files.

        Args:
            aggregated_file: Path to aggregated file
            output_dir: Directory to extract files to (created if missing)
            progress_callback: Optional callback function(percentage, message),
                invoked every 100 parsed lines and once more with 100 when
                the split has succeeded

        Returns:
            Dictionary with the keys ``files_created``,
            ``directories_created`` (count of distinct parent directories
            that had to be created), ``aggregated_file`` and ``output_dir``.

        Raises:
            InvalidFileError: If aggregated file doesn't exist or is not a file
            ParseError: If no file separator is found in the aggregated file
            FileOperationError: If file operations fail, or if a section's
                path would be written outside ``output_dir``
        """
        agg_path = Path(aggregated_file)
        out_path = Path(output_dir)

        # Validate aggregated file
        if not agg_path.exists():
            raise InvalidFileError(f"Aggregated file does not exist: {aggregated_file}")
        if not agg_path.is_file():
            raise InvalidFileError(f"Path is not a file: {aggregated_file}")

        # Create output directory if it doesn't exist
        try:
            out_path.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            raise FileOperationError(f"Failed to create output directory: {e}") from e

        separator = re.compile(self.SEPARATOR_PATTERN)

        # Parse and extract files
        try:
            # The total line count is only needed for progress messages, so
            # the extra pass over the file is skipped when nobody listens.
            total_lines = 0
            if progress_callback:
                with open(agg_path, 'r', encoding='utf-8') as counter:
                    total_lines = sum(1 for _ in counter)

            write_results: List[Dict[str, Any]] = []
            current_file: Optional[str] = None
            current_content: List[str] = []

            with open(agg_path, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, start=1):
                    # Update progress
                    if progress_callback and line_number % 100 == 0:
                        # Guard against the file having changed between the
                        # counting pass and this pass (zero total, or more
                        # lines than counted).
                        denominator = max(total_lines, line_number)
                        percentage = int((line_number / denominator) * 100)
                        progress_callback(
                            percentage, f"Parsing line {line_number}/{total_lines}"
                        )

                    match = separator.match(line.strip())
                    if match:
                        # A new separator closes the section collected so far.
                        if current_file is not None:
                            write_results.append(
                                self._write_file(out_path, current_file, current_content)
                            )
                        current_file = match.group(1)
                        current_content = []
                    elif current_file is not None:
                        # Accumulate content for current file
                        current_content.append(line)

            # Save last file
            if current_file is not None:
                write_results.append(
                    self._write_file(out_path, current_file, current_content)
                )

            files_created = sum(1 for result in write_results if result["created"])
            directories_created = {
                result["dir_created"]
                for result in write_results
                if result["dir_created"]
            }

            # Fail before announcing completion so the callback never reports
            # "Split complete" for a file that contained no sections.
            if files_created == 0:
                raise ParseError("No valid file separators found in aggregated file")

            # Final progress update
            if progress_callback:
                progress_callback(100, "Split complete")

            return {
                "files_created": files_created,
                "directories_created": len(directories_created),
                "aggregated_file": aggregated_file,
                "output_dir": output_dir
            }

        except (ParseError, FileOperationError):
            # Already one of the documented error types; re-raise untouched
            # instead of wrapping the message a second time.
            raise
        except Exception as e:
            raise FileOperationError(f"Failed to split file: {e}") from e

    def _write_file(
        self,
        output_dir: Path,
        relative_path: str,
        content_lines: List[str]
    ) -> Dict[str, Any]:
        """
        Write a file to the output directory.

        Args:
            output_dir: Base output directory
            relative_path: Relative path for the file, as read from the
                separator line of the aggregated file
            content_lines: List of content lines (each keeping its newline)

        Returns:
            Dictionary with ``created`` (always True on success),
            ``dir_created`` (the parent directory as a string if it had to
            be created, otherwise None) and ``file_path``.

        Raises:
            FileOperationError: If the target lies outside ``output_dir`` or
                the file write fails
        """
        file_path = output_dir / relative_path

        # The path comes from the aggregated file's content, so it must not
        # be trusted: "../" segments, absolute paths or symlinks could point
        # outside the output directory.
        try:
            base_dir = output_dir.resolve()
            target = file_path.resolve()
        except (OSError, RuntimeError) as e:
            raise FileOperationError(
                f"Failed to write file {relative_path}: {e}"
            ) from e
        if base_dir not in target.parents:
            raise FileOperationError(
                f"Refusing to write outside output directory: {relative_path}"
            )

        try:
            # Create parent directories if needed
            dir_created = None
            parent_dir = file_path.parent
            if not parent_dir.exists():
                parent_dir.mkdir(parents=True, exist_ok=True)
                dir_created = str(parent_dir)

            # Remove trailing blank lines added during aggregation
            content = ''.join(content_lines).rstrip('\n')
            if content:
                # Only add final newline if content exists
                content += '\n'

            file_path.write_text(content, encoding='utf-8')
        except Exception as e:
            raise FileOperationError(f"Failed to write file {relative_path}: {e}") from e

        return {
            "created": True,
            "dir_created": dir_created,
            "file_path": str(file_path)
        }