omecp/cleanup.rs
1//! Automated file cleanup for quantum chemistry calculations.
2//!
3//! This module provides functionality to automatically clean up temporary files
4//! generated during quantum chemistry calculations, particularly for ORCA calculations.
5//! The cleanup system uses a smart approach to prevent bus errors from excessive
6//! temporary files while preserving all essential files.
7//!
8//! # Philosophy: Smart File Management
9//!
10//! This implementation uses an intelligent file management strategy:
11//! - **Always keep**: Output files (.out, .log) and input files (.in, .inp)
12//! - **Latest only**: Energy/gradient files (.engrad) - only from the most recent step
13//! - **Configurable**: User-specified extensions from omecp_config.cfg
14//! - **Delete everything else**: All temporary and intermediate files
15//!
16//! This prevents bus errors from accumulating thousands of temporary files
17//! during long MECP optimization runs.
18//!
19//! # Features
20//!
21//! - **Automatic cleanup** after each QM calculation completes
22//! - **Step-based .engrad filtering** - keeps only the latest .engrad files
23//! - **Whitelist preservation** - preserves essential output and input files
24//! - **Configurable via omecp_config.cfg** for output file extensions
25//! - **Program-specific handling** (ORCA, Gaussian, XTB, etc.)
26//! - **Comprehensive logging** of all cleanup operations
27//! - **Safe operations** with proper error handling
28//!
29//! # Configuration
30//!
31//! The cleanup behavior is controlled via omecp_config.cfg. Add the cleanup
32//! section to your omecp_config.cfg file:
33//!
34//! ```ini
35//! [cleanup]
36//! # Enable or disable automatic cleanup (default: true)
37//! enabled = true
38//!
39//! # Verbose logging for cleanup operations (default: 1)
40//! # 0 = quiet, 1 = normal, 2 = verbose
41//! verbose = 1
42//!
43//! # Additional file extensions to preserve (comma-separated)
44//! # preserve_extensions = gbw,tmp,backup
45//! ```
46//!
47//! Note: Output file extension is controlled by the `[extensions]` section:
48//! ```ini
49//! [extensions]
50//! orca = out # All .out files will be preserved
51//! ```
52//!
53//! # File Preservation Strategy
54//!
55//! ## Files Always Preserved (Never Deleted)
56//! - **Output files** (.out, .log, etc.) - Calculation results
57//! - **Input files** (.in, .inp) - Input files for calculations
58//!
59//! ## Files Latest Only (Step-Based Filtering)
60//! - **Energy/gradient files** (.engrad) - Only from the most recent optimization step
61//! - Format: `{N}_state_{A|B}.engrad` where N is the step number
62//! - Keeps: Files with maximum step number (e.g., `60_state_A.engrad`)
63//! - Deletes: All other .engrad files (e.g., `59_state_*.engrad`, `58_state_*.engrad`, etc.)
64//!
65//! ## User-Configurable Extensions
66//! - Additional file extensions specified in `omecp_config.cfg` under `[cleanup]` section
67//! - All files with these extensions are preserved
68//!
69//! ## Files Always Deleted
70//! - SCF iteration files (.scf)
71//! - Temporary files (.tmp, .trash)
72//! - Lock files (.lock)
73//! - Old .engrad files (older than the latest step)
74//! - Any other file types not in the whitelist
75//!
76//! # Usage Example
77//!
//! ```rust,no_run
//! use omecp::cleanup::{CleanupConfig, CleanupManager};
//! use omecp::config::QMProgram;
//! use omecp::settings::SettingsManager;
//! use std::path::Path;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let settings_manager = SettingsManager::load()?;
//! let program = QMProgram::Orca;
//! let cleanup_config = CleanupConfig::from_settings_manager(&settings_manager, program);
//!
//! let manager = CleanupManager::new(cleanup_config, program);
//! // Clean up files in the job directory
//! manager.cleanup_directory(Path::new("compound_x"))?;
//! # Ok(())
//! # }
//! ```
92//!
93//! # Error Handling
94//!
95//! All cleanup operations return proper `Result` types and log errors
96//! without panicking. This ensures that cleanup failures don't interrupt
97//! the main calculation workflow.
98
99use crate::config::QMProgram;
100use log::{debug, error, info, warn};
101use regex::Regex;
102use std::fs;
103use std::path::Path;
104use thiserror::Error;
105
106/// Errors that can occur during cleanup operations.
107#[derive(Error, Debug)]
108pub enum CleanupError {
109 /// I/O error during file operations
110 #[error("IO error: {0}")]
111 Io(#[from] std::io::Error),
112
113 /// Invalid path error
114 #[error("Invalid path: {0}")]
115 InvalidPath(String),
116}
117
118/// Result type for cleanup operations
119pub type Result<T> = std::result::Result<T, CleanupError>;
120
/// Settings that drive the cleanup routines.
///
/// The defaults (see the `Default` impl) enable cleanup, preserve no extra
/// extensions, and run every 5 optimization steps.
#[derive(Debug, Clone)]
pub struct CleanupConfig {
    /// Master switch: when `false`, cleanup calls return without touching files.
    pub enabled: bool,

    /// Whitelisted file extensions (compared without a leading dot); files
    /// carrying one of these extensions are kept.
    pub preserve_extensions: Vec<String>,

    /// Cleanup-specific verbosity (0 = quiet, 1 = normal, 2 = verbose).
    pub verbose: u32,

    /// Run cleanup every N optimization steps (default: 5).
    /// A value of 0 disables periodic cleanup.
    pub cleanup_frequency: u32,

    /// Global print level copied from the general settings
    /// (0 = quiet, 1 = normal, 2 = verbose).
    pub print_level: u32,
}
140
141impl Default for CleanupConfig {
142 fn default() -> Self {
143 Self {
144 enabled: true,
145 preserve_extensions: Vec::new(),
146 verbose: 1,
147 cleanup_frequency: 5,
148 print_level: 0,
149 }
150 }
151}
152
153impl CleanupConfig {
154 /// Creates a cleanup configuration from settings manager.
155 ///
156 /// This integrates with omecp_config.cfg to get the user-specified output
157 /// extension for the QM program and adds it to the whitelist.
158 ///
159 /// # Arguments
160 ///
161 /// * `settings_manager` - Settings manager containing configuration
162 /// * `program` - QM program type
163 ///
164 /// # Returns
165 ///
166 /// Returns a CleanupConfig with the whitelist of files to preserve
167 pub fn from_settings_manager(
168 settings_manager: &crate::settings::SettingsManager,
169 program: QMProgram,
170 ) -> Self {
171 let settings = &settings_manager.settings();
172
173 // Get base config from settings
174 let mut config = CleanupConfig {
175 enabled: settings.cleanup.enabled,
176 preserve_extensions: settings.cleanup.preserve_extensions.clone(),
177 verbose: settings.cleanup.verbose,
178 cleanup_frequency: settings.cleanup.cleanup_frequency,
179 print_level: settings.general.print_level,
180 };
181
182 // Get user-specified output extension for this program
183 let user_ext = settings_manager.get_output_extension(program);
184
185 // Add user-specified output extension to whitelist (always preserve it)
186 if !user_ext.is_empty() && !config.preserve_extensions.iter().any(|s| s == user_ext) {
187 config.preserve_extensions.push(user_ext.to_string());
188 }
189
190 // Add program-specific essential files to whitelist
191 // Note: .engrad is handled separately with step-based filtering
192 let essential_extensions = match program {
193 // Gaussian: only needs output extension
194 QMProgram::Gaussian => vec![],
195 // ORCA: needs .gbw in addition to output (engrad is step-filtered)
196 QMProgram::Orca => vec!["gbw".to_string()],
197 // XTB: only needs output extension
198 QMProgram::Xtb => vec![],
199 // BAGEL: only needs output extension
200 QMProgram::Bagel => vec![],
201 // Custom: only needs output extension
202 QMProgram::Custom => vec![],
203 };
204
205 // Add essential extensions to whitelist
206 for ext in essential_extensions {
207 if !config.preserve_extensions.contains(&ext) {
208 config.preserve_extensions.push(ext);
209 }
210 }
211
212 if config.verbose >= 2 {
213 info!("Cleanup configuration for {:?}:", program);
214 info!(" Enabled: {}", config.enabled);
215 info!(" User output extension: {}", user_ext);
216 info!(" Whitelist extensions: {:?}", config.preserve_extensions);
217 }
218
219 config
220 }
221
222 /// Gets the list of preserve extensions
223 pub fn get_preserve_extensions(&self) -> &[String] {
224 &self.preserve_extensions
225 }
226
227 /// Checks if cleanup is enabled
228 pub fn is_enabled(&self) -> bool {
229 self.enabled
230 }
231
232 /// Gets verbosity level
233 pub fn verbosity(&self) -> u32 {
234 self.verbose
235 }
236
237 /// Gets cleanup frequency (every N steps)
238 pub fn cleanup_frequency(&self) -> u32 {
239 self.cleanup_frequency
240 }
241
242 /// Checks if logging should occur based on print_level and verbose settings.
243 ///
244 /// This combines the global print_level with the cleanup-specific verbose setting:
245 /// - If print_level is 0 (quiet), no cleanup messages are printed regardless of verbose
246 /// - If print_level is 1 (normal), messages are printed based on verbose level
247 /// - If print_level is 2 (verbose), all messages are printed
248 ///
249 /// # Arguments
250 ///
251 /// * `min_verbose_level` - Minimum verbose level required (0, 1, or 2)
252 ///
253 /// # Returns
254 ///
255 /// Returns `true` if logging should occur, `false` otherwise
256 pub fn should_log(&self, min_verbose_level: u32) -> bool {
257 // If global print_level is 0 (quiet), suppress all cleanup output
258 if self.print_level == 0 {
259 return false;
260 }
261
262 // If global print_level is 2 (verbose), allow all messages
263 if self.print_level >= 2 {
264 return true;
265 }
266
267 // If global print_level is 1 (normal), check verbose level
268 // verbose = 0: quiet, verbose = 1: normal, verbose = 2: verbose
269 self.verbose >= min_verbose_level
270 }
271}
272
273/// Manages cleanup operations for quantum chemistry calculations.
274pub struct CleanupManager {
275 /// Cleanup configuration with whitelist
276 config: CleanupConfig,
277
278 /// QM program type
279 program: QMProgram,
280}
281
282impl CleanupManager {
283 /// Creates a new cleanup manager.
284 ///
285 /// # Arguments
286 ///
287 /// * `config` - Cleanup configuration with whitelist
288 /// * `program` - QM program type
289 ///
290 /// # Returns
291 ///
292 /// Returns a new CleanupManager instance
293 pub fn new(config: CleanupConfig, program: QMProgram) -> Self {
294 Self { config, program }
295 }
296
297 /// Cleans up temporary files in the specified directory.
298 ///
299 /// Uses a smart approach: preserves essential files and keeps only the
300 /// latest .engrad files to prevent bus errors from excessive files.
301 ///
302 /// # Arguments
303 ///
304 /// * `directory` - Path to the directory to clean (e.g., job directory from input file stem)
305 ///
306 /// # Returns
307 ///
308 /// Returns `Ok(())` on success or a `CleanupError` on failure
309 pub fn cleanup_directory(&self, directory: &Path) -> Result<()> {
310 if !self.config.enabled {
311 if self.config.should_log(1) {
312 info!(
313 "Cleanup is disabled, skipping directory: {}",
314 directory.display()
315 );
316 }
317 return Ok(());
318 }
319
320 if !directory.exists() {
321 if self.config.should_log(2) {
322 debug!(
323 "Directory does not exist, skipping: {}",
324 directory.display()
325 );
326 }
327 return Ok(());
328 }
329
330 if !directory.is_dir() {
331 return Err(CleanupError::InvalidPath(format!(
332 "Path is not a directory: {}",
333 directory.display()
334 )));
335 }
336
337 if self.config.should_log(2) {
338 info!("Starting cleanup in directory: {}", directory.display());
339 info!(
340 "Preserving files with extensions: {:?}",
341 self.config.preserve_extensions
342 );
343 }
344
345 // Read all directory entries
346 let entries = fs::read_dir(directory).map_err(CleanupError::Io)?;
347
348 let mut all_files = Vec::new();
349 for entry in entries {
350 match entry {
351 Ok(entry) => {
352 let path = entry.path();
353 let filename = path
354 .file_name()
355 .and_then(|s| s.to_str())
356 .unwrap_or("")
357 .to_string();
358
359 // Skip hidden files and directories
360 if !filename.starts_with('.') && !path.is_dir() {
361 all_files.push((path, filename));
362 }
363 }
364 Err(e) => {
365 warn!("Error reading directory entry: {}", e);
366 }
367 }
368 }
369
370 // Find the maximum step number from .inp files
371 let max_step = self.find_max_step_number(&all_files);
372
373 let mut cleaned_files = Vec::new();
374 let mut preserved_files = Vec::new();
375 let mut errors = Vec::new();
376
377 for (path, filename) in all_files {
378 let extension = path.extension().and_then(|s| s.to_str()).unwrap_or("");
379
380 // Check if this file should be preserved
381 if self.should_preserve_file(extension, &path, &filename, max_step) {
382 preserved_files.push(path.clone());
383 if self.config.should_log(2) {
384 debug!("Preserving file: {}", path.display());
385 }
386 } else {
387 // Delete the file
388 match fs::remove_file(&path) {
389 Ok(_) => {
390 cleaned_files.push(path.clone());
391 if self.config.should_log(2) {
392 info!("Cleaned up file: {}", path.display());
393 }
394 }
395 Err(e) => {
396 warn!("Failed to remove file {}: {}", path.display(), e);
397 errors.push((path.clone(), e));
398 }
399 }
400 }
401 }
402
403 if self.config.should_log(2) {
404 info!(
405 "Cleanup completed: {} files deleted, {} files preserved",
406 cleaned_files.len(),
407 preserved_files.len()
408 );
409 }
410 if self.config.should_log(1) && max_step > 0 {
411 debug!("Latest step number: {}", max_step);
412 }
413
414 if !errors.is_empty() {
415 error!("Cleanup completed with {} errors", errors.len());
416 }
417
418 Ok(())
419 }
420
421 /// Finds the maximum step number from .inp files in the directory.
422 ///
423 /// Scans for files matching the pattern `{N}_state_{A|B}.inp` and returns
424 /// the maximum step number N found.
425 ///
426 /// # Arguments
427 ///
428 /// * `files` - List of (path, filename) tuples
429 ///
430 /// # Returns
431 ///
432 /// Returns the maximum step number found, or 0 if no step-based files exist
433 fn find_max_step_number(&self, files: &[(std::path::PathBuf, String)]) -> usize {
434 let inp_regex = Regex::new(r"^(\d+)_state_[AB]\.inp$").unwrap();
435 let mut max_step = 0;
436
437 for (_, filename) in files {
438 if let Some(caps) = inp_regex.captures(filename) {
439 if let Ok(step) = caps[1].parse::<usize>() {
440 if step > max_step {
441 max_step = step;
442 }
443 }
444 }
445 }
446
447 max_step
448 }
449
450 /// Extracts the step number from a .engrad filename.
451 ///
452 /// Parses files matching the pattern `{N}_state_{A|B}.engrad` and returns
453 /// the step number N.
454 ///
455 /// # Arguments
456 ///
457 /// * `filename` - The filename to parse
458 ///
459 /// # Returns
460 ///
461 /// Returns Some(step_number) if the filename matches the pattern, None otherwise
462 fn extract_step_from_engrad(&self, filename: &str) -> Option<usize> {
463 let engrad_regex = Regex::new(r"^(\d+)_state_[AB]\.engrad$").unwrap();
464 engrad_regex
465 .captures(filename)
466 .and_then(|caps| caps[1].parse::<usize>().ok())
467 }
468
469 /// Determines if a file should be preserved (whitelist check with step-based filtering).
470 ///
471 /// Files are preserved based on:
472 /// 1. Extension in whitelist (always keep)
473 /// 2. Special filename patterns (always keep)
474 /// 3. .engrad files from the latest step (keep only)
475 /// 4. All other files (delete)
476 ///
477 /// # Arguments
478 ///
479 /// * `extension` - File extension without the dot
480 /// * `path` - Full file path
481 /// * `filename` - Just the filename
482 /// * `max_step` - Maximum step number from .inp files
483 ///
484 /// # Returns
485 ///
486 /// Returns `true` if the file should be preserved, `false` otherwise
487 pub fn should_preserve_file(
488 &self,
489 extension: &str,
490 _path: &Path,
491 filename: &str,
492 max_step: usize,
493 ) -> bool {
494 // Always preserve essential file types
495 if extension == "out" || extension == "log" || extension == "in" || extension == "inp" {
496 return true;
497 }
498
499 // Preserve input.inp (even without .inp extension check)
500 if filename == "input.inp" {
501 return true;
502 }
503
504 // Special handling for .engrad files - only keep from the latest step
505 if extension == "engrad" {
506 if let Some(step) = self.extract_step_from_engrad(filename) {
507 // Keep only .engrad files from the maximum step
508 return step == max_step;
509 }
510 // .engrad files that don't match the pattern should be deleted
511 return false;
512 }
513
514 // Whitelist check: preserve if extension is in our list
515 if self
516 .config
517 .preserve_extensions
518 .iter()
519 .any(|ext| ext == extension)
520 {
521 return true;
522 }
523
524 // Program-specific preservation of special files
525 match self.program {
526 QMProgram::Orca => {
527 // ORCA checkpoint and state files (.gbw) - already in whitelist
528 }
529
530 QMProgram::Gaussian => {
531 // Gaussian checkpoint files (.chk) - already in whitelist
532 }
533
534 _ => {
535 // Other programs
536 }
537 }
538
539 // Not in whitelist, should be deleted
540 false
541 }
542
543 /// Cleans up a single file if it's not in the whitelist.
544 ///
545 /// Note: For .engrad files, this method conservatively deletes them unless
546 /// they are from the latest step. To use step-based filtering, use
547 /// cleanup_directory() instead.
548 ///
549 /// # Arguments
550 ///
551 /// * `file_path` - Path to the file to clean
552 ///
553 /// # Returns
554 ///
555 /// Returns `Ok(true)` if file was deleted, `Ok(false)` if preserved
556 pub fn cleanup_file(&self, file_path: &Path) -> Result<bool> {
557 if !self.config.enabled {
558 return Ok(false);
559 }
560
561 if !file_path.exists() || file_path.is_dir() {
562 return Ok(false);
563 }
564
565 let filename = file_path.file_name().and_then(|s| s.to_str()).unwrap_or("");
566
567 let extension = file_path.extension().and_then(|s| s.to_str()).unwrap_or("");
568
569 // For .engrad files, conservatively delete them unless we can determine
570 // they should be preserved (use cleanup_directory for step-based filtering)
571 if extension == "engrad" {
572 // .engrad files are deleted by default (not in preserve_extensions)
573 // unless cleanup_directory determines they should be kept
574 if self.config.should_log(2) {
575 debug!(
576 "Deleting .engrad file (use cleanup_directory for step-based filtering): {}",
577 file_path.display()
578 );
579 }
580 match fs::remove_file(file_path) {
581 Ok(_) => {
582 if self.config.should_log(2) {
583 info!("Cleaned up file: {}", file_path.display());
584 }
585 Ok(true)
586 }
587 Err(e) => {
588 error!("Failed to remove file {}: {}", file_path.display(), e);
589 Err(CleanupError::Io(e))
590 }
591 }
592 } else {
593 // For non-.engrad files, use normal whitelist check
594 if self.should_preserve_file_simple(extension, filename) {
595 if self.config.should_log(2) {
596 debug!("Preserving file: {}", file_path.display());
597 }
598 return Ok(false);
599 }
600
601 match fs::remove_file(file_path) {
602 Ok(_) => {
603 if self.config.should_log(2) {
604 info!("Cleaned up file: {}", file_path.display());
605 }
606 Ok(true)
607 }
608 Err(e) => {
609 error!("Failed to remove file {}: {}", file_path.display(), e);
610 Err(CleanupError::Io(e))
611 }
612 }
613 }
614 }
615
616 /// Simplified whitelist check for files that don't need step-based filtering.
617 ///
618 /// # Arguments
619 ///
620 /// * `extension` - File extension without the dot
621 /// * `filename` - Just the filename
622 ///
623 /// # Returns
624 ///
625 /// Returns `true` if the file should be preserved, `false` otherwise
626 fn should_preserve_file_simple(&self, extension: &str, filename: &str) -> bool {
627 // Always preserve essential file types
628 if extension == "out" || extension == "log" || extension == "in" || extension == "inp" {
629 return true;
630 }
631
632 // Preserve input.inp (even without .inp extension check)
633 if filename == "input.inp" {
634 return true;
635 }
636
637 // Whitelist check: preserve if extension is in our list
638 if self
639 .config
640 .preserve_extensions
641 .iter()
642 .any(|ext| ext == extension)
643 {
644 return true;
645 }
646
647 // Not in whitelist, should be deleted
648 false
649 }
650
651 /// Gets the cleanup configuration (read-only).
652 pub fn config(&self) -> &CleanupConfig {
653 &self.config
654 }
655
656 /// Checks if a file would be preserved without actually cleaning it
657 ///
658 /// Note: For .engrad files, this returns false (use cleanup_directory for step-based filtering)
659 ///
660 /// # Arguments
661 ///
662 /// * `file_path` - Path to check
663 ///
664 /// # Returns
665 ///
666 /// Returns `true` if the file would be preserved (is in whitelist)
667 pub fn would_preserve(&self, file_path: &Path) -> bool {
668 if !file_path.exists() || file_path.is_dir() {
669 return false;
670 }
671
672 let filename = file_path.file_name().and_then(|s| s.to_str()).unwrap_or("");
673
674 let extension = file_path.extension().and_then(|s| s.to_str()).unwrap_or("");
675
676 // For .engrad files, conservatively return false
677 if extension == "engrad" {
678 return false;
679 }
680
681 self.should_preserve_file_simple(extension, filename)
682 }
683}