Skip to main content
← OpenMECP Documentation

omecp/
cleanup.rs

1//! Automated file cleanup for quantum chemistry calculations.
2//!
3//! This module provides functionality to automatically clean up temporary files
4//! generated during quantum chemistry calculations, particularly for ORCA calculations.
5//! The cleanup system uses a smart approach to prevent bus errors from excessive
6//! temporary files while preserving all essential files.
7//!
8//! # Philosophy: Smart File Management
9//!
10//! This implementation uses an intelligent file management strategy:
11//! - **Always keep**: Output files (.out, .log) and input files (.in, .inp)
12//! - **Latest only**: Energy/gradient files (.engrad) - only from the most recent step
13//! - **Configurable**: User-specified extensions from omecp_config.cfg
14//! - **Delete everything else**: All temporary and intermediate files
15//!
16//! This prevents bus errors from accumulating thousands of temporary files
17//! during long MECP optimization runs.
18//!
19//! # Features
20//!
21//! - **Automatic cleanup** after each QM calculation completes
22//! - **Step-based .engrad filtering** - keeps only the latest .engrad files
23//! - **Whitelist preservation** - preserves essential output and input files
24//! - **Configurable via omecp_config.cfg** for output file extensions
25//! - **Program-specific handling** (ORCA, Gaussian, XTB, etc.)
26//! - **Comprehensive logging** of all cleanup operations
27//! - **Safe operations** with proper error handling
28//!
29//! # Configuration
30//!
31//! The cleanup behavior is controlled via omecp_config.cfg. Add the cleanup
32//! section to your omecp_config.cfg file:
33//!
34//! ```ini
35//! [cleanup]
36//! # Enable or disable automatic cleanup (default: true)
37//! enabled = true
38//!
39//! # Verbose logging for cleanup operations (default: 1)
40//! # 0 = quiet, 1 = normal, 2 = verbose
41//! verbose = 1
42//!
43//! # Additional file extensions to preserve (comma-separated)
44//! # preserve_extensions = gbw,tmp,backup
45//! ```
46//!
47//! Note: Output file extension is controlled by the `[extensions]` section:
48//! ```ini
49//! [extensions]
50//! orca = out        # All .out files will be preserved
51//! ```
52//!
53//! # File Preservation Strategy
54//!
55//! ## Files Always Preserved (Never Deleted)
56//! - **Output files** (.out, .log, etc.) - Calculation results
57//! - **Input files** (.in, .inp) - Input files for calculations
58//!
59//! ## Files Latest Only (Step-Based Filtering)
60//! - **Energy/gradient files** (.engrad) - Only from the most recent optimization step
61//!   - Format: `{N}_state_{A|B}.engrad` where N is the step number
62//!   - Keeps: Files with maximum step number (e.g., `60_state_A.engrad`)
63//!   - Deletes: All other .engrad files (e.g., `59_state_*.engrad`, `58_state_*.engrad`, etc.)
64//!
65//! ## User-Configurable Extensions
66//! - Additional file extensions specified in `omecp_config.cfg` under `[cleanup]` section
67//! - All files with these extensions are preserved
68//!
69//! ## Files Always Deleted
70//! - SCF iteration files (.scf)
71//! - Temporary files (.tmp, .trash)
72//! - Lock files (.lock)
73//! - Old .engrad files (older than the latest step)
74//! - Any other file types not in the whitelist
75//!
76//! # Usage Example
77//!
78//! ```rust
79//! use omecp::cleanup::{CleanupManager, CleanupConfig};
80//! use omecp::settings::SettingsManager;
81//! use omecp::config::QMProgram;
82//! use std::path::Path;
83//!
84//! let settings_manager = SettingsManager::load()?;
85//! let program = QMProgram::Orca;
86//! let cleanup_config = CleanupConfig::from_settings_manager(&settings_manager, program);
87//!
88//! let manager = CleanupManager::new(cleanup_config, program);
89//! // Clean up files in the job directory
90//! manager.cleanup_directory(Path::new("compound_x"))?;
91//! ```
92//!
93//! # Error Handling
94//!
95//! All cleanup operations return proper `Result` types and log errors
96//! without panicking. This ensures that cleanup failures don't interrupt
97//! the main calculation workflow.
98
99use crate::config::QMProgram;
100use log::{debug, error, info, warn};
101use regex::Regex;
102use std::fs;
103use std::path::Path;
104use thiserror::Error;
105
/// Errors that can occur during cleanup operations.
///
/// Within this module, [`CleanupError::Io`] wraps failures reported by
/// `std::fs` (listing a directory, removing a file) and
/// [`CleanupError::InvalidPath`] is returned when a cleanup target exists but
/// is not a directory.
#[derive(Error, Debug)]
pub enum CleanupError {
    /// I/O error during file operations.
    ///
    /// The `#[from]` attribute generates `From<std::io::Error>`, so `?` on an
    /// `std::io::Result` converts into this variant automatically.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Invalid path error.
    ///
    /// The payload is a human-readable message that already contains the
    /// offending path.
    #[error("Invalid path: {0}")]
    InvalidPath(String),
}
117
/// Result type for cleanup operations.
///
/// Shorthand for `std::result::Result<T, CleanupError>`. Because the alias is
/// named `Result`, it shadows the prelude type inside this module; a result
/// with a different error type has to be written with the fully qualified
/// `std::result::Result`.
pub type Result<T> = std::result::Result<T, CleanupError>;
120
/// Configuration for cleanup operations.
///
/// Plain data consumed by `CleanupManager`. Instances are normally built with
/// `CleanupConfig::from_settings_manager`, which also extends
/// `preserve_extensions` with the program's output extension; the `Default`
/// implementation provides the values quoted as "default" below.
#[derive(Debug, Clone)]
pub struct CleanupConfig {
    /// Enable automatic cleanup (default: `true`).
    ///
    /// When `false`, the manager's cleanup methods return without touching
    /// the filesystem.
    pub enabled: bool,

    /// File extensions to preserve (whitelist).
    ///
    /// Compared verbatim against `Path::extension()`, i.e. entries are written
    /// without a leading dot and the comparison is case-sensitive.
    pub preserve_extensions: Vec<String>,

    /// Verbosity level for cleanup logging
    /// (0 = quiet, 1 = normal, 2 = verbose; default: 1).
    pub verbose: u32,

    /// Perform cleanup every N optimization steps (default: 5)
    /// Set to 0 to disable periodic cleanup
    ///
    /// NOTE(review): this module only stores and exposes the value; the
    /// every-N-steps scheduling is presumably applied by the caller — confirm.
    pub cleanup_frequency: u32,

    /// Global print level from general settings (0=quiet, 1=normal, 2=verbose)
    ///
    /// Takes precedence over `verbose` in `should_log`: 0 silences all cleanup
    /// messages and 2 or more enables all of them.
    pub print_level: u32,
}
140
141impl Default for CleanupConfig {
142    fn default() -> Self {
143        Self {
144            enabled: true,
145            preserve_extensions: Vec::new(),
146            verbose: 1,
147            cleanup_frequency: 5,
148            print_level: 0,
149        }
150    }
151}
152
153impl CleanupConfig {
154    /// Creates a cleanup configuration from settings manager.
155    ///
156    /// This integrates with omecp_config.cfg to get the user-specified output
157    /// extension for the QM program and adds it to the whitelist.
158    ///
159    /// # Arguments
160    ///
161    /// * `settings_manager` - Settings manager containing configuration
162    /// * `program` - QM program type
163    ///
164    /// # Returns
165    ///
166    /// Returns a CleanupConfig with the whitelist of files to preserve
167    pub fn from_settings_manager(
168        settings_manager: &crate::settings::SettingsManager,
169        program: QMProgram,
170    ) -> Self {
171        let settings = &settings_manager.settings();
172
173        // Get base config from settings
174        let mut config = CleanupConfig {
175            enabled: settings.cleanup.enabled,
176            preserve_extensions: settings.cleanup.preserve_extensions.clone(),
177            verbose: settings.cleanup.verbose,
178            cleanup_frequency: settings.cleanup.cleanup_frequency,
179            print_level: settings.general.print_level,
180        };
181
182        // Get user-specified output extension for this program
183        let user_ext = settings_manager.get_output_extension(program);
184
185        // Add user-specified output extension to whitelist (always preserve it)
186        if !user_ext.is_empty() && !config.preserve_extensions.iter().any(|s| s == user_ext) {
187            config.preserve_extensions.push(user_ext.to_string());
188        }
189
190        // Add program-specific essential files to whitelist
191        // Note: .engrad is handled separately with step-based filtering
192        let essential_extensions = match program {
193            // Gaussian: only needs output extension
194            QMProgram::Gaussian => vec![],
195            // ORCA: needs .gbw in addition to output (engrad is step-filtered)
196            QMProgram::Orca => vec!["gbw".to_string()],
197            // XTB: only needs output extension
198            QMProgram::Xtb => vec![],
199            // BAGEL: only needs output extension
200            QMProgram::Bagel => vec![],
201            // Custom: only needs output extension
202            QMProgram::Custom => vec![],
203        };
204
205        // Add essential extensions to whitelist
206        for ext in essential_extensions {
207            if !config.preserve_extensions.contains(&ext) {
208                config.preserve_extensions.push(ext);
209            }
210        }
211
212        if config.verbose >= 2 {
213            info!("Cleanup configuration for {:?}:", program);
214            info!("  Enabled: {}", config.enabled);
215            info!("  User output extension: {}", user_ext);
216            info!("  Whitelist extensions: {:?}", config.preserve_extensions);
217        }
218
219        config
220    }
221
222    /// Gets the list of preserve extensions
223    pub fn get_preserve_extensions(&self) -> &[String] {
224        &self.preserve_extensions
225    }
226
227    /// Checks if cleanup is enabled
228    pub fn is_enabled(&self) -> bool {
229        self.enabled
230    }
231
232    /// Gets verbosity level
233    pub fn verbosity(&self) -> u32 {
234        self.verbose
235    }
236
237    /// Gets cleanup frequency (every N steps)
238    pub fn cleanup_frequency(&self) -> u32 {
239        self.cleanup_frequency
240    }
241
242    /// Checks if logging should occur based on print_level and verbose settings.
243    ///
244    /// This combines the global print_level with the cleanup-specific verbose setting:
245    /// - If print_level is 0 (quiet), no cleanup messages are printed regardless of verbose
246    /// - If print_level is 1 (normal), messages are printed based on verbose level
247    /// - If print_level is 2 (verbose), all messages are printed
248    ///
249    /// # Arguments
250    ///
251    /// * `min_verbose_level` - Minimum verbose level required (0, 1, or 2)
252    ///
253    /// # Returns
254    ///
255    /// Returns `true` if logging should occur, `false` otherwise
256    pub fn should_log(&self, min_verbose_level: u32) -> bool {
257        // If global print_level is 0 (quiet), suppress all cleanup output
258        if self.print_level == 0 {
259            return false;
260        }
261
262        // If global print_level is 2 (verbose), allow all messages
263        if self.print_level >= 2 {
264            return true;
265        }
266
267        // If global print_level is 1 (normal), check verbose level
268        // verbose = 0: quiet, verbose = 1: normal, verbose = 2: verbose
269        self.verbose >= min_verbose_level
270    }
271}
272
/// Manages cleanup operations for quantum chemistry calculations.
///
/// Holds the whitelist configuration and the QM program the job uses, and
/// exposes the directory-wide and single-file cleanup entry points. The
/// manager keeps no other state, so every call decides from the current
/// directory contents.
pub struct CleanupManager {
    /// Cleanup configuration with whitelist
    config: CleanupConfig,

    /// QM program type
    // Read by `should_preserve_file` when applying program-specific rules.
    program: QMProgram,
}
281
282impl CleanupManager {
283    /// Creates a new cleanup manager.
284    ///
285    /// # Arguments
286    ///
287    /// * `config` - Cleanup configuration with whitelist
288    /// * `program` - QM program type
289    ///
290    /// # Returns
291    ///
292    /// Returns a new CleanupManager instance
293    pub fn new(config: CleanupConfig, program: QMProgram) -> Self {
294        Self { config, program }
295    }
296
297    /// Cleans up temporary files in the specified directory.
298    ///
299    /// Uses a smart approach: preserves essential files and keeps only the
300    /// latest .engrad files to prevent bus errors from excessive files.
301    ///
302    /// # Arguments
303    ///
304    /// * `directory` - Path to the directory to clean (e.g., job directory from input file stem)
305    ///
306    /// # Returns
307    ///
308    /// Returns `Ok(())` on success or a `CleanupError` on failure
309    pub fn cleanup_directory(&self, directory: &Path) -> Result<()> {
310        if !self.config.enabled {
311            if self.config.should_log(1) {
312                info!(
313                    "Cleanup is disabled, skipping directory: {}",
314                    directory.display()
315                );
316            }
317            return Ok(());
318        }
319
320        if !directory.exists() {
321            if self.config.should_log(2) {
322                debug!(
323                    "Directory does not exist, skipping: {}",
324                    directory.display()
325                );
326            }
327            return Ok(());
328        }
329
330        if !directory.is_dir() {
331            return Err(CleanupError::InvalidPath(format!(
332                "Path is not a directory: {}",
333                directory.display()
334            )));
335        }
336
337        if self.config.should_log(2) {
338            info!("Starting cleanup in directory: {}", directory.display());
339            info!(
340                "Preserving files with extensions: {:?}",
341                self.config.preserve_extensions
342            );
343        }
344
345        // Read all directory entries
346        let entries = fs::read_dir(directory).map_err(CleanupError::Io)?;
347
348        let mut all_files = Vec::new();
349        for entry in entries {
350            match entry {
351                Ok(entry) => {
352                    let path = entry.path();
353                    let filename = path
354                        .file_name()
355                        .and_then(|s| s.to_str())
356                        .unwrap_or("")
357                        .to_string();
358
359                    // Skip hidden files and directories
360                    if !filename.starts_with('.') && !path.is_dir() {
361                        all_files.push((path, filename));
362                    }
363                }
364                Err(e) => {
365                    warn!("Error reading directory entry: {}", e);
366                }
367            }
368        }
369
370        // Find the maximum step number from .inp files
371        let max_step = self.find_max_step_number(&all_files);
372
373        let mut cleaned_files = Vec::new();
374        let mut preserved_files = Vec::new();
375        let mut errors = Vec::new();
376
377        for (path, filename) in all_files {
378            let extension = path.extension().and_then(|s| s.to_str()).unwrap_or("");
379
380            // Check if this file should be preserved
381            if self.should_preserve_file(extension, &path, &filename, max_step) {
382                preserved_files.push(path.clone());
383                    if self.config.should_log(2) {
384                        debug!("Preserving file: {}", path.display());
385                    }
386            } else {
387                // Delete the file
388                match fs::remove_file(&path) {
389                    Ok(_) => {
390                        cleaned_files.push(path.clone());
391                        if self.config.should_log(2) {
392                            info!("Cleaned up file: {}", path.display());
393                        }
394                    }
395                    Err(e) => {
396                        warn!("Failed to remove file {}: {}", path.display(), e);
397                        errors.push((path.clone(), e));
398                    }
399                }
400            }
401        }
402
403        if self.config.should_log(2) {
404            info!(
405                "Cleanup completed: {} files deleted, {} files preserved",
406                cleaned_files.len(),
407                preserved_files.len()
408            );
409        }
410        if self.config.should_log(1) && max_step > 0 {
411            debug!("Latest step number: {}", max_step);
412        }
413
414        if !errors.is_empty() {
415            error!("Cleanup completed with {} errors", errors.len());
416        }
417
418        Ok(())
419    }
420
421    /// Finds the maximum step number from .inp files in the directory.
422    ///
423    /// Scans for files matching the pattern `{N}_state_{A|B}.inp` and returns
424    /// the maximum step number N found.
425    ///
426    /// # Arguments
427    ///
428    /// * `files` - List of (path, filename) tuples
429    ///
430    /// # Returns
431    ///
432    /// Returns the maximum step number found, or 0 if no step-based files exist
433    fn find_max_step_number(&self, files: &[(std::path::PathBuf, String)]) -> usize {
434        let inp_regex = Regex::new(r"^(\d+)_state_[AB]\.inp$").unwrap();
435        let mut max_step = 0;
436
437        for (_, filename) in files {
438            if let Some(caps) = inp_regex.captures(filename) {
439                if let Ok(step) = caps[1].parse::<usize>() {
440                    if step > max_step {
441                        max_step = step;
442                    }
443                }
444            }
445        }
446
447        max_step
448    }
449
450    /// Extracts the step number from a .engrad filename.
451    ///
452    /// Parses files matching the pattern `{N}_state_{A|B}.engrad` and returns
453    /// the step number N.
454    ///
455    /// # Arguments
456    ///
457    /// * `filename` - The filename to parse
458    ///
459    /// # Returns
460    ///
461    /// Returns Some(step_number) if the filename matches the pattern, None otherwise
462    fn extract_step_from_engrad(&self, filename: &str) -> Option<usize> {
463        let engrad_regex = Regex::new(r"^(\d+)_state_[AB]\.engrad$").unwrap();
464        engrad_regex
465            .captures(filename)
466            .and_then(|caps| caps[1].parse::<usize>().ok())
467    }
468
469    /// Determines if a file should be preserved (whitelist check with step-based filtering).
470    ///
471    /// Files are preserved based on:
472    /// 1. Extension in whitelist (always keep)
473    /// 2. Special filename patterns (always keep)
474    /// 3. .engrad files from the latest step (keep only)
475    /// 4. All other files (delete)
476    ///
477    /// # Arguments
478    ///
479    /// * `extension` - File extension without the dot
480    /// * `path` - Full file path
481    /// * `filename` - Just the filename
482    /// * `max_step` - Maximum step number from .inp files
483    ///
484    /// # Returns
485    ///
486    /// Returns `true` if the file should be preserved, `false` otherwise
487    pub fn should_preserve_file(
488        &self,
489        extension: &str,
490        _path: &Path,
491        filename: &str,
492        max_step: usize,
493    ) -> bool {
494        // Always preserve essential file types
495        if extension == "out" || extension == "log" || extension == "in" || extension == "inp" {
496            return true;
497        }
498
499        // Preserve input.inp (even without .inp extension check)
500        if filename == "input.inp" {
501            return true;
502        }
503
504        // Special handling for .engrad files - only keep from the latest step
505        if extension == "engrad" {
506            if let Some(step) = self.extract_step_from_engrad(filename) {
507                // Keep only .engrad files from the maximum step
508                return step == max_step;
509            }
510            // .engrad files that don't match the pattern should be deleted
511            return false;
512        }
513
514        // Whitelist check: preserve if extension is in our list
515        if self
516            .config
517            .preserve_extensions
518            .iter()
519            .any(|ext| ext == extension)
520        {
521            return true;
522        }
523
524        // Program-specific preservation of special files
525        match self.program {
526            QMProgram::Orca => {
527                // ORCA checkpoint and state files (.gbw) - already in whitelist
528            }
529
530            QMProgram::Gaussian => {
531                // Gaussian checkpoint files (.chk) - already in whitelist
532            }
533
534            _ => {
535                // Other programs
536            }
537        }
538
539        // Not in whitelist, should be deleted
540        false
541    }
542
543    /// Cleans up a single file if it's not in the whitelist.
544    ///
545    /// Note: For .engrad files, this method conservatively deletes them unless
546    /// they are from the latest step. To use step-based filtering, use
547    /// cleanup_directory() instead.
548    ///
549    /// # Arguments
550    ///
551    /// * `file_path` - Path to the file to clean
552    ///
553    /// # Returns
554    ///
555    /// Returns `Ok(true)` if file was deleted, `Ok(false)` if preserved
556    pub fn cleanup_file(&self, file_path: &Path) -> Result<bool> {
557        if !self.config.enabled {
558            return Ok(false);
559        }
560
561        if !file_path.exists() || file_path.is_dir() {
562            return Ok(false);
563        }
564
565        let filename = file_path.file_name().and_then(|s| s.to_str()).unwrap_or("");
566
567        let extension = file_path.extension().and_then(|s| s.to_str()).unwrap_or("");
568
569        // For .engrad files, conservatively delete them unless we can determine
570        // they should be preserved (use cleanup_directory for step-based filtering)
571        if extension == "engrad" {
572            // .engrad files are deleted by default (not in preserve_extensions)
573            // unless cleanup_directory determines they should be kept
574            if self.config.should_log(2) {
575                debug!(
576                    "Deleting .engrad file (use cleanup_directory for step-based filtering): {}",
577                    file_path.display()
578                );
579            }
580            match fs::remove_file(file_path) {
581                Ok(_) => {
582                    if self.config.should_log(2) {
583                        info!("Cleaned up file: {}", file_path.display());
584                    }
585                    Ok(true)
586                }
587                Err(e) => {
588                    error!("Failed to remove file {}: {}", file_path.display(), e);
589                    Err(CleanupError::Io(e))
590                }
591            }
592        } else {
593            // For non-.engrad files, use normal whitelist check
594            if self.should_preserve_file_simple(extension, filename) {
595                if self.config.should_log(2) {
596                    debug!("Preserving file: {}", file_path.display());
597                }
598                return Ok(false);
599            }
600
601            match fs::remove_file(file_path) {
602                Ok(_) => {
603                    if self.config.should_log(2) {
604                        info!("Cleaned up file: {}", file_path.display());
605                    }
606                    Ok(true)
607                }
608                Err(e) => {
609                    error!("Failed to remove file {}: {}", file_path.display(), e);
610                    Err(CleanupError::Io(e))
611                }
612            }
613        }
614    }
615
616    /// Simplified whitelist check for files that don't need step-based filtering.
617    ///
618    /// # Arguments
619    ///
620    /// * `extension` - File extension without the dot
621    /// * `filename` - Just the filename
622    ///
623    /// # Returns
624    ///
625    /// Returns `true` if the file should be preserved, `false` otherwise
626    fn should_preserve_file_simple(&self, extension: &str, filename: &str) -> bool {
627        // Always preserve essential file types
628        if extension == "out" || extension == "log" || extension == "in" || extension == "inp" {
629            return true;
630        }
631
632        // Preserve input.inp (even without .inp extension check)
633        if filename == "input.inp" {
634            return true;
635        }
636
637        // Whitelist check: preserve if extension is in our list
638        if self
639            .config
640            .preserve_extensions
641            .iter()
642            .any(|ext| ext == extension)
643        {
644            return true;
645        }
646
647        // Not in whitelist, should be deleted
648        false
649    }
650
651    /// Gets the cleanup configuration (read-only).
652    pub fn config(&self) -> &CleanupConfig {
653        &self.config
654    }
655
656    /// Checks if a file would be preserved without actually cleaning it
657    ///
658    /// Note: For .engrad files, this returns false (use cleanup_directory for step-based filtering)
659    ///
660    /// # Arguments
661    ///
662    /// * `file_path` - Path to check
663    ///
664    /// # Returns
665    ///
666    /// Returns `true` if the file would be preserved (is in whitelist)
667    pub fn would_preserve(&self, file_path: &Path) -> bool {
668        if !file_path.exists() || file_path.is_dir() {
669            return false;
670        }
671
672        let filename = file_path.file_name().and_then(|s| s.to_str()).unwrap_or("");
673
674        let extension = file_path.extension().and_then(|s| s.to_str()).unwrap_or("");
675
676        // For .engrad files, conservatively return false
677        if extension == "engrad" {
678            return false;
679        }
680
681        self.should_preserve_file_simple(extension, filename)
682    }
683}