omecp/
parser.rs

1//! Input file parsing for OpenMECP configuration.
2//!
3//! This module provides parsing functionality for OpenMECP input files, which
4//! use a custom section-based format. The parser handles:
5//!
6//! - **Geometries**: Main geometry and LST interpolation geometries
7//! - **Configuration**: All calculation parameters (method, program, etc.)
8//! - **Constraints**: Bond lengths, angles, and fixed atoms
9//! - **Tail sections**: Additional keywords for each state
10//!
11//! # Input File Format
12//!
13//! OpenMECP input files use a section-based syntax marked with `*SECTION` markers
14//! and terminated with `*`. The format is designed to be human-readable and
15//! flexible for different types of calculations.
16//!
17//! ## Required Sections
18//!
19//! ### *GEOM Section
20//!
21//! Contains the initial molecular geometry in Cartesian coordinates:
22//!
23//! ```text
24//! *GEOM
25//! C  0.0  0.0  0.0
26//! H  1.0  0.0  0.0
27//! H  0.0  1.0  0.0
28//! *
29//! ```
30//!
31//! Can also reference external files:
32//!
33//! ```text
34//! *GEOM
35//! @geometry.xyz
36//! *
37//! ```
38//!
39//! ### Key-Value Parameters
40//!
41//! After sections, specify calculation parameters:
42//!
43//! ```text
44//! program = gaussian
45//! method = B3LYP/6-31G*
46//! nprocs = 4
47//! mem = 4GB
48//! charge = 0
49//! mult_state_a = 1  # or mult_a = 1
50//! mult_state_b = 3  # or mult_b = 3
51//! ```
52//!
53//! ## Optional Sections
54//!
55//! - `*TAIL_a` / `*TAIL_b`: Additional keywords for each electronic state
56//! - `*CONSTR`: Geometric constraints (bonds, angles, scans)
57//! - `*LST_a` / `*LST_b`: Geometries for linear synchronous transit interpolation
58//!
59//! # Examples
60//!
61//! ```
62//! use omecp::parser::{parse_input, InputData};
63//! use std::path::Path;
64//!
65//! // Parse an input file
66//! let input_path = Path::new("input.inp");
67//! let input_data: InputData = parse_input(input_path)?;
68//!
69//! // Access parsed data
70//! let config = input_data.config;
71//! let geometry = input_data.geometry;
72//! let constraints = input_data.constraints;
73//! ```
74
75use crate::config::{Config, HessianMethod, QMProgram, RunMode, ScanSpec, ScanType};
76use crate::constraints::Constraint;
77use crate::geometry::Geometry;
78use regex::Regex;
79use std::fs;
80use std::path::Path;
81use thiserror::Error;
82
83/// Error type for parsing operations.
84///
85/// Parsing can fail due to:
86/// - File I/O errors (missing files, permission issues)
87/// - Format errors (malformed input, invalid values)
88/// - Unsupported file formats
89/// - Invalid tail section content
90/// - Gaussian syntax errors
91#[derive(Error, Debug)]
92pub enum ParseError {
93    /// I/O error when reading files
94    #[error("IO error: {0}")]
95    Io(#[from] std::io::Error),
96    /// Parse error with descriptive message
97    #[error("Parse error: {0}")]
98    Parse(String),
99    /// Invalid tail section content with detailed context
100    #[error("Invalid tail section '{section}'{}: {message}", line_number.map(|n| format!(" at line {}", n)).unwrap_or_default())]
101    InvalidTailSection {
102        /// Name of the tail section (e.g., "TAIL_a", "TAIL_b")
103        section: String,
104        /// Descriptive error message
105        message: String,
106        /// Optional line number where the error occurred
107        line_number: Option<usize>,
108    },
109    /// Gaussian syntax error with detailed context
110    #[error("Gaussian syntax error in {section}{}: {details}", line_number.map(|n| format!(" at line {}", n)).unwrap_or_default())]
111    GaussianSyntaxError {
112        /// Name of the section where the error occurred
113        section: String,
114        /// Detailed error information
115        details: String,
116        /// Optional line number where the error occurred
117        line_number: Option<usize>,
118    },
119}
120
121/// Type alias for parse operation results
122type Result<T> = std::result::Result<T, ParseError>;
123
124/// Complete parsed input data from an OpenMECP input file.
125///
126/// This struct contains all information parsed from an input file, organized
127/// into logical components:
128///
129/// # Examples
130///
131/// ```
132/// use omecp::parser::InputData;
133/// use omecp::config::Config;
134/// use omecp::geometry::Geometry;
135/// use omecp::constraints::Constraint;
136///
137/// // Access parsed components
138/// let input_data: InputData = /* ... parsed from file ... */;
139/// let config = &input_data.config;
140/// let geometry = &input_data.geometry;
141/// let constraints = &input_data.constraints;
142/// let tail_a = &input_data.tail_a;
143/// let tail_b = &input_data.tail_b;
144/// ```
145pub struct InputData {
146    /// Complete calculation configuration
147    pub config: Config,
148    /// Initial molecular geometry
149    pub geometry: Geometry,
150    /// List of geometric constraints
151    pub constraints: Vec<Constraint>,
152    /// Additional keywords for electronic state 1
153    pub tail_a: String,
154    /// Additional keywords for electronic state 2
155    pub tail_b: String,
156    /// List of fixed atom indices (0-based)
157    pub fixed_atoms: Vec<usize>,
158    /// Optional geometry for LST interpolation (first endpoint)
159    pub lst_a: Option<Geometry>,
160    /// Optional geometry for LST interpolation (second endpoint)
161    pub lst_b: Option<Geometry>,
162}
163
164/// Parse an OpenMECP input file.
165///
166/// This function reads and parses a complete OpenMECP input file, extracting
167/// all configuration parameters, geometries, constraints, and additional
168/// keywords. It supports the custom section-based input format with support
169/// for external geometry files.
170///
171/// # Arguments
172///
173/// * `path` - Path to the input file (e.g., "input.inp")
174///
175/// # Returns
176///
177/// Returns `Ok(InputData)` on successful parsing, or `Err(ParseError)` if:
178/// - The file cannot be read (I/O error)
179/// - The file format is invalid
180/// - Required sections are missing
181/// - External geometry files cannot be read
182///
183/// # Examples
184///
185/// Basic usage:
186///
187/// ```
188/// use omecp::parser::parse_input;
189/// use std::path::Path;
190///
191/// let input_path = Path::new("input.inp");
192/// match parse_input(input_path) {
193///     Ok(input_data) => {
194///         println!("Parsed {} atoms", input_data.geometry.num_atoms);
195///         println!("QM program: {:?}", input_data.config.program);
196///     }
197///     Err(e) => eprintln!("Parse error: {}", e),
198/// }
199/// ```
200///
201/// Parsing with external geometry file:
202///
203/// ```text
204/// // input.inp
205/// *GEOM
206/// @molecule.xyz
207/// *
208///
209/// program = gaussian
210/// method = B3LYP/6-31G*
211/// // ... other parameters ...
212/// ```
213///
214/// ```
215/// use omecp::parser::parse_input;
216/// use std::path::Path;
217///
218/// let input_data = parse_input(Path::new("input.inp"))?;
219/// // Geometry will be read from molecule.xyz
220/// ```
221pub fn parse_input(path: &Path) -> Result<InputData> {
222    let content: String = fs::read_to_string(path)?;
223    let mut config: Config = Config::default();
224    let mut elements: Vec<String> = Vec::new();
225    let mut coords: Vec<f64> = Vec::new();
226    let mut constraints: Vec<Constraint> = Vec::new();
227    let mut tail_a: String = String::new();
228    let mut tail_b: String = String::new();
229    let mut fixed_atoms: Vec<usize> = Vec::new();
230    let mut lst_a_elements: Vec<String> = Vec::new();
231    let mut lst_a_coords: Vec<f64> = Vec::new();
232    let mut lst_b_elements: Vec<String> = Vec::new();
233    let mut lst_b_coords: Vec<f64> = Vec::new();
234    let mut oniom_layer_info: Vec<String> = Vec::new();
235
236    let mut in_geom: bool = false;
237    let mut in_tail_a: bool = false;
238    let mut in_tail_b: bool = false;
239    let mut in_constr: bool = false;
240    let mut in_lst_a: bool = false;
241    let mut in_lst_b: bool = false;
242
243    let geom_re: Regex = Regex::new(r"^\s*(\S+)\s+(-?\d+\.?\d*)").unwrap();
244
245    for line in content.lines() {
246        let line_lower: String = line.to_lowercase();
247        let trimmed: &str = line_lower.trim();
248
249        if trimmed.starts_with('#') {
250            continue;
251        }
252
253        if trimmed.contains("*geom") {
254            in_geom = true;
255            continue;
256        } else if trimmed.contains("*tail_a") {
257            in_tail_a = true;
258            continue;
259        } else if trimmed.contains("*tail_b") {
260            in_tail_b = true;
261            continue;
262        } else if trimmed.contains("*constr") {
263            in_constr = true;
264            continue;
265        } else if trimmed.contains("*lst_a") {
266            in_lst_a = true;
267            continue;
268        } else if trimmed.contains("*lst_b") {
269            in_lst_b = true;
270            continue;
271        } else if trimmed == "*" {
272            in_geom = false;
273            in_tail_a = false;
274            in_tail_b = false;
275            in_constr = false;
276            in_lst_a = false;
277            in_lst_b = false;
278            continue;
279        }
280
281        if in_geom {
282            if line.trim().starts_with('@') {
283                // External geometry file
284                let filename: &str = line.trim().strip_prefix('@').unwrap().trim();
285                let external_path: &Path = Path::new(filename);
286                let (ext_elements, ext_coords) = read_external_geometry(external_path)?;
287                elements = ext_elements;
288                coords = ext_coords;
289            } else if geom_re.is_match(line) {
290                let parts: Vec<&str> = line.split_whitespace().collect();
291                if parts.len() >= 4 {
292                    elements.push(parts[0].to_string());
293                    coords.push(
294                        parts[1]
295                            .parse()
296                            .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
297                    );
298                    coords.push(
299                        parts[2]
300                            .parse()
301                            .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
302                    );
303                    coords.push(
304                        parts[3]
305                            .parse()
306                            .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
307                    );
308                    if parts.len() > 4 {
309                        oniom_layer_info.push(parts[4..].join(" "));
310                    } else {
311                        oniom_layer_info.push(String::new());
312                    }
313                }
314            }
315        } else if in_lst_a && geom_re.is_match(line) {
316            let parts: Vec<&str> = line.split_whitespace().collect();
317            if parts.len() >= 4 {
318                lst_a_elements.push(parts[0].to_string());
319                lst_a_coords.push(
320                    parts[1]
321                        .parse()
322                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
323                );
324                lst_a_coords.push(
325                    parts[2]
326                        .parse()
327                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
328                );
329                lst_a_coords.push(
330                    parts[3]
331                        .parse()
332                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
333                );
334            }
335        } else if in_lst_b && geom_re.is_match(line) {
336            let parts: Vec<&str> = line.split_whitespace().collect();
337            if parts.len() >= 4 {
338                lst_b_elements.push(parts[0].to_string());
339                lst_b_coords.push(
340                    parts[1]
341                        .parse()
342                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
343                );
344                lst_b_coords.push(
345                    parts[2]
346                        .parse()
347                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
348                );
349                lst_b_coords.push(
350                    parts[3]
351                        .parse()
352                        .map_err(|_| ParseError::Parse("Invalid coordinate".into()))?,
353                );
354            }
355        } else if in_tail_a {
356            if !is_comment_line(line) && !line.trim().is_empty() {
357                if !tail_a.is_empty() {
358                    tail_a.push(' '); // Add space separator between keywords
359                }
360                tail_a.push_str(line.trim());
361            }
362        } else if in_tail_b {
363            if !is_comment_line(line) && !line.trim().is_empty() {
364                if !tail_b.is_empty() {
365                    tail_b.push(' '); // Add space separator between keywords
366                }
367                tail_b.push_str(line.trim());
368            }
369        } else if in_constr && !trimmed.is_empty() {
370            if trimmed.starts_with('s') {
371                parse_scan(line, &mut config)?;
372            } else {
373                parse_constraint(line, &mut constraints)?;
374            }
375        } else if trimmed.contains('=') {
376            parse_parameter(line, &mut config, &mut fixed_atoms)?;
377        }
378    }
379
380    config.oniom_layer_info = oniom_layer_info;
381
382    // Validate tail sections after parsing with enhanced error reporting
383    // Provide context about filtering if sections became empty
384    validate_tail_section_with_filtering_context(&tail_a, "TAIL_a")?;
385    validate_tail_section_with_filtering_context(&tail_b, "TAIL_b")?;
386
387    // Coordinates are stored directly in Angstrom (no conversion needed)
388    let geometry = Geometry::new(elements, coords);
389    
390    let lst_a = if !lst_a_elements.is_empty() {
391        // LST geometry coordinates are also stored in Angstrom
392        Some(Geometry::new(lst_a_elements, lst_a_coords))
393    } else {
394        None
395    };
396    let lst_b = if !lst_b_elements.is_empty() {
397        // LST geometry coordinates are also stored in Angstrom
398        Some(Geometry::new(lst_b_elements, lst_b_coords))
399    } else {
400        None
401    };
402
403    // Validate constraints against the parsed geometry
404    validate_constraints_against_geometry(&constraints, &geometry)?;
405
406    Ok(InputData {
407        config,
408        geometry,
409        constraints,
410        tail_a,
411        tail_b,
412        fixed_atoms,
413        lst_a,
414        lst_b,
415    })
416}
417
418/// Parses a single constraint line from the *CONSTR section.
419///
420/// Supports the following constraint types:
421/// - `r atom1 atom2 target_distance` - Bond length constraint
422/// - `a atom1 atom2 atom3 target_angle` - Bond angle constraint (degrees)
423/// - `d atom1 atom2 atom3 atom4 target_dihedral` - Dihedral angle constraint (degrees)
424///
425/// # Arguments
426///
427/// * `line` - The constraint line to parse
428/// * `constraints` - Mutable vector to add parsed constraints to
429///
430/// # Returns
431///
432/// `Ok(())` if parsing succeeds, `Err(ParseError)` with detailed error information if parsing fails.
433///
434/// # Examples
435///
436/// ```
437/// // Bond constraint: keep atoms 1-2 at 1.5 Angstrom
438/// r 1 2 1.5
439///
440/// // Angle constraint: keep angle 1-2-3 at 120 degrees
441/// a 1 2 3 120.0
442///
443/// // Dihedral constraint: keep dihedral 1-2-3-4 at 180 degrees
444/// d 1 2 3 4 180.0
445/// ```
446fn parse_constraint(line: &str, constraints: &mut Vec<Constraint>) -> Result<()> {
447    let trimmed = line.trim();
448    if trimmed.is_empty() || trimmed.starts_with('#') {
449        return Ok(());
450    }
451
452    let parts: Vec<&str> = trimmed.split_whitespace().collect();
453    if parts.is_empty() {
454        return Ok(());
455    }
456
457    let constraint_type = parts[0].to_lowercase();
458
459    match constraint_type.as_str() {
460        "r" | "bond" => {
461            if parts.len() < 4 {
462                return Err(ParseError::Parse(format!(
463                    "Bond constraint requires 4 parameters: 'r atom1 atom2 target_distance'. Got: '{}'",
464                    line
465                )));
466            }
467
468            let a = parse_atom_index(parts[1], "first atom", line)?;
469            let b = parse_atom_index(parts[2], "second atom", line)?;
470            let target = parse_distance_target(parts[3], line)?;
471
472            if a == b {
473                return Err(ParseError::Parse(format!(
474                    "Bond constraint cannot have the same atom twice: atom {} in line '{}'",
475                    a + 1,
476                    line
477                )));
478            }
479
480            constraints.push(Constraint::Bond {
481                atoms: (a, b),
482                target,
483            });
484        }
485        "a" | "angle" => {
486            if parts.len() < 5 {
487                return Err(ParseError::Parse(format!(
488                    "Angle constraint requires 5 parameters: 'a atom1 atom2 atom3 target_angle'. Got: '{}'",
489                    line
490                )));
491            }
492
493            let a = parse_atom_index(parts[1], "first atom", line)?;
494            let b = parse_atom_index(parts[2], "second atom", line)?;
495            let c = parse_atom_index(parts[3], "third atom", line)?;
496            let target_degrees = parse_angle_target(parts[4], line)?;
497            let target = target_degrees.to_radians();
498
499            // Validate atom indices are unique
500            if a == b || b == c || a == c {
501                return Err(ParseError::Parse(format!(
502                    "Angle constraint atoms must be unique. Got atoms {}, {}, {} in line '{}'",
503                    a + 1,
504                    b + 1,
505                    c + 1,
506                    line
507                )));
508            }
509
510            constraints.push(Constraint::Angle {
511                atoms: (a, b, c),
512                target,
513            });
514        }
515        "d" | "dihedral" => {
516            if parts.len() < 6 {
517                return Err(ParseError::Parse(format!(
518                    "Dihedral constraint requires 6 parameters: 'd atom1 atom2 atom3 atom4 target_dihedral'. Got: '{}'",
519                    line
520                )));
521            }
522
523            let a = parse_atom_index(parts[1], "first atom", line)?;
524            let b = parse_atom_index(parts[2], "second atom", line)?;
525            let c = parse_atom_index(parts[3], "third atom", line)?;
526            let d = parse_atom_index(parts[4], "fourth atom", line)?;
527            let target_degrees = parse_angle_target(parts[5], line)?;
528            let target = target_degrees.to_radians();
529
530            // Validate atom indices are unique
531            let atoms = [a, b, c, d];
532            for i in 0..4 {
533                for j in (i + 1)..4 {
534                    if atoms[i] == atoms[j] {
535                        return Err(ParseError::Parse(format!(
536                            "Dihedral constraint atoms must be unique. Got atoms {}, {}, {}, {} in line '{}'",
537                            a + 1, b + 1, c + 1, d + 1, line
538                        )));
539                    }
540                }
541            }
542
543            constraints.push(Constraint::Dihedral {
544                atoms: (a, b, c, d),
545                target,
546            });
547        }
548        _ => {
549            return Err(ParseError::Parse(format!(
550                "Unknown constraint type '{}'. Supported types: 'r' (bond), 'a' (angle), 'd' (dihedral). Line: '{}'",
551                constraint_type, line
552            )));
553        }
554    }
555    Ok(())
556}
557
558/// Parses an atom index from a string with detailed error reporting.
559fn parse_atom_index(s: &str, atom_description: &str, full_line: &str) -> Result<usize> {
560    let index = s.parse::<usize>()
561        .map_err(|_| ParseError::Parse(format!(
562            "Invalid {} index '{}' in constraint line '{}'. Atom indices must be positive integers.",
563            atom_description, s, full_line
564        )))?;
565
566    if index == 0 {
567        return Err(ParseError::Parse(format!(
568            "Atom indices must be 1-based (starting from 1), got {} for {} in line '{}'",
569            index, atom_description, full_line
570        )));
571    }
572
573    Ok(index - 1) // Convert to 0-based indexing
574}
575
576/// Parses a distance target value with validation.
577fn parse_distance_target(s: &str, full_line: &str) -> Result<f64> {
578    let distance = s.parse::<f64>()
579        .map_err(|_| ParseError::Parse(format!(
580            "Invalid distance target '{}' in constraint line '{}'. Distance must be a positive number.",
581            s, full_line
582        )))?;
583
584    if distance <= 0.0 {
585        return Err(ParseError::Parse(format!(
586            "Distance target must be positive, got {} in line '{}'",
587            distance, full_line
588        )));
589    }
590
591    if distance > 20.0 {
592        return Err(ParseError::Parse(format!(
593            "Distance target {} Angstrom seems unreasonably large in line '{}'. Maximum allowed: 20.0 Angstrom",
594            distance, full_line
595        )));
596    }
597
598    Ok(distance)
599}
600
601/// Parses an angle target value (in degrees) with validation.
602fn parse_angle_target(s: &str, full_line: &str) -> Result<f64> {
603    let angle = s.parse::<f64>().map_err(|_| {
604        ParseError::Parse(format!(
605            "Invalid angle target '{}' in constraint line '{}'. Angle must be a number in degrees.",
606            s, full_line
607        ))
608    })?;
609
610    if !(0.0..=360.0).contains(&angle) {
611        return Err(ParseError::Parse(format!(
612            "Angle target {} degrees is outside valid range [0, 360] in line '{}'",
613            angle, full_line
614        )));
615    }
616
617    Ok(angle)
618}
619
620/// Validates parsed constraints against the molecular geometry.
621///
622/// This function performs comprehensive validation of constraints including:
623/// - Atom index bounds checking
624/// - Duplicate constraint detection
625/// - Geometric feasibility checks
626/// - Target value reasonableness
627///
628/// # Arguments
629///
630/// * `constraints` - List of parsed constraints to validate
631/// * `geometry` - Molecular geometry to validate against
632///
633/// # Returns
634///
635/// `Ok(())` if all constraints are valid, `Err(ParseError)` with detailed error information otherwise.
636fn validate_constraints_against_geometry(
637    constraints: &[Constraint],
638    geometry: &Geometry,
639) -> Result<()> {
640    let num_atoms = geometry.num_atoms;
641
642    // Validate each constraint
643    for (i, constraint) in constraints.iter().enumerate() {
644        match constraint {
645            Constraint::Bond {
646                atoms: (a, b),
647                target,
648            } => {
649                // Check atom indices
650                if *a >= num_atoms {
651                    return Err(ParseError::Parse(format!(
652                        "Bond constraint {}: atom index {} exceeds number of atoms ({})",
653                        i + 1,
654                        a + 1,
655                        num_atoms
656                    )));
657                }
658                if *b >= num_atoms {
659                    return Err(ParseError::Parse(format!(
660                        "Bond constraint {}: atom index {} exceeds number of atoms ({})",
661                        i + 1,
662                        b + 1,
663                        num_atoms
664                    )));
665                }
666
667                // Check target reasonableness
668                if *target > 10.0 {
669                    return Err(ParseError::Parse(format!(
670                        "Bond constraint {}: target distance {:.3} Angstrom is unreasonably large",
671                        i + 1,
672                        target
673                    )));
674                }
675            }
676            Constraint::Angle {
677                atoms: (a, b, c),
678                target: _,
679            } => {
680                // Check atom indices
681                for (atom_idx, atom_name) in [(*a, "first"), (*b, "second"), (*c, "third")] {
682                    if atom_idx >= num_atoms {
683                        return Err(ParseError::Parse(format!(
684                            "Angle constraint {}: {} atom index {} exceeds number of atoms ({})",
685                            i + 1,
686                            atom_name,
687                            atom_idx + 1,
688                            num_atoms
689                        )));
690                    }
691                }
692            }
693            Constraint::Dihedral {
694                atoms: (a, b, c, d),
695                target: _,
696            } => {
697                // Check atom indices
698                for (atom_idx, atom_name) in
699                    [(*a, "first"), (*b, "second"), (*c, "third"), (*d, "fourth")]
700                {
701                    if atom_idx >= num_atoms {
702                        return Err(ParseError::Parse(format!(
703                            "Dihedral constraint {}: {} atom index {} exceeds number of atoms ({})",
704                            i + 1,
705                            atom_name,
706                            atom_idx + 1,
707                            num_atoms
708                        )));
709                    }
710                }
711            }
712        }
713    }
714
715    // Check for duplicate constraints
716    for i in 0..constraints.len() {
717        for j in (i + 1)..constraints.len() {
718            if constraints_are_equivalent(&constraints[i], &constraints[j]) {
719                return Err(ParseError::Parse(format!(
720                    "Duplicate constraints found: constraint {} and {} define the same geometric parameter",
721                    i + 1, j + 1
722                )));
723            }
724        }
725    }
726
727    Ok(())
728}
729
730/// Checks if two constraints are equivalent (define the same geometric parameter).
731fn constraints_are_equivalent(c1: &Constraint, c2: &Constraint) -> bool {
732    match (c1, c2) {
733        (
734            Constraint::Bond {
735                atoms: (a1, b1), ..
736            },
737            Constraint::Bond {
738                atoms: (a2, b2), ..
739            },
740        ) => (a1 == a2 && b1 == b2) || (a1 == b2 && b1 == a2),
741        (
742            Constraint::Angle {
743                atoms: (a1, b1, c1),
744                ..
745            },
746            Constraint::Angle {
747                atoms: (a2, b2, c2),
748                ..
749            },
750        ) => (a1 == a2 && b1 == b2 && c1 == c2) || (a1 == c2 && b1 == b2 && c1 == a2),
751        (
752            Constraint::Dihedral {
753                atoms: (a1, b1, c1, d1),
754                ..
755            },
756            Constraint::Dihedral {
757                atoms: (a2, b2, c2, d2),
758                ..
759            },
760        ) => {
761            (a1 == a2 && b1 == b2 && c1 == c2 && d1 == d2)
762                || (a1 == d2 && b1 == c2 && c1 == b2 && d1 == a2)
763        }
764        _ => false,
765    }
766}
767
768fn parse_scan(line: &str, config: &mut Config) -> Result<()> {
769    let parts: Vec<&str> = line.split_whitespace().collect();
770    if parts.len() < 6 {
771        return Ok(());
772    }
773
774    let scan_type = match parts[1].to_lowercase().as_str() {
775        "r" if parts.len() >= 7 => {
776            let a = parts[2]
777                .parse::<usize>()
778                .map_err(|_| ParseError::Parse("Invalid atom".into()))?
779                - 1;
780            let b = parts[3]
781                .parse::<usize>()
782                .map_err(|_| ParseError::Parse("Invalid atom".into()))?
783                - 1;
784            ScanType::Bond { atoms: (a, b) }
785        }
786        "a" if parts.len() >= 8 => {
787            let a = parts[2]
788                .parse::<usize>()
789                .map_err(|_| ParseError::Parse("Invalid atom".into()))?
790                - 1;
791            let b = parts[3]
792                .parse::<usize>()
793                .map_err(|_| ParseError::Parse("Invalid atom".into()))?
794                - 1;
795            let c = parts[4]
796                .parse::<usize>()
797                .map_err(|_| ParseError::Parse("Invalid atom".into()))?
798                - 1;
799            ScanType::Angle { atoms: (a, b, c) }
800        }
801        _ => return Ok(()),
802    };
803
804    let offset = if matches!(scan_type, ScanType::Bond { .. }) {
805        4
806    } else {
807        5
808    };
809    let start = parts[offset]
810        .parse()
811        .map_err(|_| ParseError::Parse("Invalid start".into()))?;
812    let num_points = parts[offset + 1]
813        .parse()
814        .map_err(|_| ParseError::Parse("Invalid num".into()))?;
815    let step_size = parts[offset + 2]
816        .parse()
817        .map_err(|_| ParseError::Parse("Invalid step".into()))?;
818
819    config.scans.push(ScanSpec {
820        scan_type,
821        start,
822        num_points,
823        step_size,
824    });
825    Ok(())
826}
827
828/// Parses a boolean value from a string, supporting multiple formats.
829///
830/// Supports:
831/// - `true`/`false` (case-insensitive)
832/// - `yes`/`no` (case-insensitive)
833/// - `1`/`0`
834///
835/// Returns `false` for any unrecognized value.
836fn parse_bool(value: &str) -> bool {
837    let value_lower = value.trim().to_lowercase();
838    match value_lower.as_str() {
839        "true" | "yes" | "1" => true,
840        "false" | "no" | "0" => false,
841        _ => false, // Default to false for unrecognized values
842    }
843}
844
845/// Parses configuration from a string content.
846/// All thresholds and coordinates use Angstrom-based units internally.
847pub fn parse_parameter(line: &str, config: &mut Config, fixed_atoms: &mut Vec<usize>) -> Result<()> {
848    let parts: Vec<&str> = line.splitn(2, '=').collect();
849    if parts.len() != 2 {
850        return Ok(());
851    }
852
853    let key = parts[0].trim().to_lowercase();
854    let raw_value = parts[1].trim();
855
856    // Remove inline comments from the value
857    let value = if let Some(comment_pos) = raw_value.find('#') {
858        raw_value[..comment_pos].trim()
859    } else {
860        raw_value
861    };
862
863    match key.as_str() {
864        "nprocs" => config.nprocs = value.parse().unwrap_or(1),
865        "mem" => config.mem = value.to_string(),
866        "charge" => config.charge = value.parse().unwrap_or(0),
867        "charge_b" => {
868            if let Ok(val) = value.parse() {
869                config.charge_b = Some(val);
870            }
871        }
872        "mult_state_a" | "mult_a" => config.mult_state_a = value.parse().unwrap_or(1),
873        "mult_state_b" | "mult_b" => config.mult_state_b = value.parse().unwrap_or(1),
874        "method" => config.method = value.to_string(),
875        "program" => {
876            config.program = match value.to_lowercase().as_str() {
877                "gaussian" => QMProgram::Gaussian,
878                "orca" => QMProgram::Orca,
879                "bagel" => QMProgram::Bagel,
880                "xtb" => QMProgram::Xtb,
881                _ => QMProgram::Gaussian,
882            };
883        }
884        "mode" => {
885            config.run_mode = match value.to_lowercase().as_str() {
886                "read" => RunMode::Read,
887                "noread" => RunMode::NoRead,
888                "stable" => RunMode::Stable,
889                "inter_read" => RunMode::InterRead,
890                _ => RunMode::Normal,
891            };
892        }
893        "td_state_a" | "td_a" => config.td_state_a = value.to_string(),
894        "td_state_b" | "td_b" => config.td_state_b = value.to_string(),
895        "state_a" => config.state_a = value.parse().unwrap_or(0),
896        "state_b" => config.state_b = value.parse().unwrap_or(0),
897        "mp2" => config.mp2 = parse_bool(value),
898        "max_steps" => config.max_steps = value.parse().unwrap_or(100),
899        "max_step_size" => {
900            // User specifies in Angstrom directly
901            config.max_step_size = value.parse().unwrap_or(0.1);
902        }
903        "max_history" => {
904            config.max_history = value.parse().unwrap_or_else(|_| {
905                eprintln!(
906                    "Warning: Invalid max_history value '{}', using default (5)",
907                    value
908                );
909                5
910            });
911        }
912        "fix_de" => config.fix_de = value.parse().unwrap_or(0.0),
913        "delta_e" => config.thresholds.delta_e = value.parse().unwrap_or(0.000050),
914        // Displacement thresholds: users specify in Angstrom
915        "rms_dis" => {
916            config.thresholds.rms_dis = value.parse().unwrap_or(0.0025);
917        }
918        "max_dis" => {
919            config.thresholds.max_dis = value.parse().unwrap_or(0.004);
920        }
921        // Gradient thresholds: input in Ha/Å
922        "max_grad" => {
923            config.thresholds.max_grad = value.parse().unwrap_or(0.0007);
924        }
925        "rms_grad" => {
926            config.thresholds.rms_grad = value.parse().unwrap_or(0.0005);
927        }
928        "bagel_model" => config.bagel_model = value.to_string(),
929        "custom_interface_file" => config.custom_interface_file = value.to_string(),
930        "drive_coordinate" => config.drive_coordinate = value.to_string(),
931        "drive_start" => config.drive_start = value.parse().unwrap_or(0.0),
932        "drive_end" => config.drive_end = value.parse().unwrap_or(0.0),
933        "drive_steps" => config.drive_steps = value.parse().unwrap_or(10),
934        "drive_type" => config.drive_type = value.to_string(),
935        "use_gediis" => {
936            let lower = value.trim().to_lowercase();
937            match lower.as_str() {
938                "blend" | "sequential" => {
939                    config.use_gediis = true;
940                    config.gediis_variant = lower;
941                }
942                _ => config.use_gediis = parse_bool(value),
943            }
944        }
945        "use_hybrid_gediis" => config.use_hybrid_gediis = parse_bool(value),
946        "gediis_blend_mode" => {
947            let lower = value.trim().to_lowercase();
948            match lower.as_str() {
949                "fixed" | "gradient" | "sequential" | "fixed_sequential" => {
950                    config.gediis_blend_mode = lower;
951                }
952                _ => {
953                    eprintln!(
954                        "Warning: Invalid gediis_blend_mode '{}', expected 'fixed', 'fixed_sequential', 'gradient', or 'sequential', using 'fixed'",
955                        value
956                    );
957                }
958            }
959        }
960        "gediis_switch_rms" => {
961            config.gediis_switch_rms = value.parse().unwrap_or_else(|_| {
962                eprintln!("Warning: Invalid gediis_switch_rms '{}', using default 0.005", value);
963                0.005
964            });
965        }
966        "gediis_switch_step" => {
967            config.gediis_switch_step = value.parse().unwrap_or_else(|_| {
968                eprintln!("Warning: Invalid gediis_switch_step '{}', using default 0.001", value);
969                0.001
970            });
971        }
972        "switch_step" => {
973            config.switch_step = value.parse().unwrap_or_else(|_| {
974                eprintln!(
975                    "Warning: Invalid switch_step value '{}', using default (3)",
976                    value
977                );
978                3
979            });
980        }
981        "drive_atoms" => {
982            config.drive_atoms = value
983                .split(',')
984                .filter_map(|s| s.trim().parse::<usize>().ok())
985                .map(|x| x.saturating_sub(1)) // Convert to 0-based indexing
986                .collect();
987        }
988        "fixedatoms" => {
989            for group in value.split(',') {
990                if group.contains('-') {
991                    let range: Vec<&str> = group.split('-').collect();
992                    if range.len() == 2 {
993                        if let (Ok(start), Ok(end)) =
994                            (range[0].parse::<usize>(), range[1].parse::<usize>())
995                        {
996                            for i in start..=end {
997                                fixed_atoms.push(i - 1);
998                            }
999                        }
1000                    }
1001                } else if let Ok(atom) = group.parse::<usize>() {
1002                    fixed_atoms.push(atom - 1);
1003                }
1004            }
1005        }
1006        "gau_comm" => {
1007            config
1008                .program_commands
1009                .insert("gaussian".to_string(), value.to_string());
1010        }
1011        "orca_comm" => {
1012            config
1013                .program_commands
1014                .insert("orca".to_string(), value.to_string());
1015        }
1016        "xtb_comm" => {
1017            config
1018                .program_commands
1019                .insert("xtb".to_string(), value.to_string());
1020        }
1021        "bagel_comm" => {
1022            config
1023                .program_commands
1024                .insert("bagel".to_string(), value.to_string());
1025        }
1026        "isoniom" => config.is_oniom = parse_bool(value),
1027        "chargeandmultforoniom1" => config.charge_and_mult_oniom1 = value.to_string(),
1028        "chargeandmultforoniom2" => config.charge_and_mult_oniom2 = value.to_string(),
1029        "basis" => config.basis_set = value.to_string(),
1030        "solvent" => config.solvent = value.to_string(),
1031        "dispersion" => config.dispersion = value.to_string(),
1032        "restart" => config.restart = parse_bool(value),
1033        "print_checkpoint" => config.print_checkpoint = parse_bool(value),
1034        "smart_history" => config.smart_history = parse_bool(value),
1035        "print_level" => config.print_level = value.parse().unwrap_or(1),
1036        "reduced_factor" => config.reduced_factor = value.parse().unwrap_or(0.5),
1037        "bfgs_rho" => config.bfgs_rho = value.parse().unwrap_or(15.0),
1038        "step_reduction_multiplier" => {
1039            config.step_reduction_multiplier = value.parse().unwrap_or_else(|_| {
1040                eprintln!("Warning: Invalid step_reduction_multiplier '{}', using default 10.0", value);
1041                10.0
1042            });
1043        }
1044        "trust_reduction_factor" => {
1045            config.trust_reduction_factor = value.parse().unwrap_or_else(|_| {
1046                eprintln!("Warning: Invalid trust_reduction_factor '{}', using default 0.5", value);
1047                0.5
1048            });
1049        }
1050        "trust_increase_factor" => {
1051            config.trust_increase_factor = value.parse().unwrap_or_else(|_| {
1052                eprintln!("Warning: Invalid trust_increase_factor '{}', using default 1.2", value);
1053                1.2
1054            });
1055        }
1056        "trust_inc_threshold" => {
1057            config.trust_inc_threshold = value.parse().unwrap_or_else(|_| {
1058                eprintln!("Warning: Invalid trust_inc_threshold '{}', using default 0.0001", value);
1059                0.0001
1060            });
1061        }
1062        "trust_dec_threshold" => {
1063            config.trust_dec_threshold = value.parse().unwrap_or_else(|_| {
1064                eprintln!("Warning: Invalid trust_dec_threshold '{}', using default 0.0001", value);
1065                0.0001
1066            });
1067        }
1068        "trust_min_radius" => {
1069            config.trust_min_radius = value.parse().unwrap_or_else(|_| {
1070                eprintln!("Warning: Invalid trust_min_radius '{}', using default 0.01", value);
1071                0.01
1072            });
1073        }
1074        "trust_max_radius" => {
1075            config.trust_max_radius = value.parse().unwrap_or_else(|_| {
1076                eprintln!("Warning: Invalid trust_max_radius '{}', using default 1.0", value);
1077                1.0
1078            });
1079        }
1080        "steepest_descent_step" => {
1081            config.steepest_descent_step = value.parse().unwrap_or_else(|_| {
1082                eprintln!("Warning: Invalid steepest_descent_step '{}', using default 0.01", value);
1083                0.01
1084            });
1085        }
1086        // New Fortran-ported DIIS options
1087        "use_robust_diis" => config.use_robust_diis = parse_bool(value),
1088        "gediis_variant" => config.gediis_variant = value.to_lowercase(),
1089        "gdiis_cosine_check" => config.gdiis_cosine_check = value.to_lowercase(),
1090        "gdiis_coeff_check" => config.gdiis_coeff_check = value.to_lowercase(),
1091        "n_neg" => config.n_neg = value.parse().unwrap_or(0),
1092        "gediis_sim_switch" => config.gediis_sim_switch = value.parse().unwrap_or(0.0025),
1093        // Hessian update method
1094        "hessian" => {
1095            let lower = value.trim().to_lowercase();
1096            config.hessian_method = match lower.as_str() {
1097                "direct_psb" => HessianMethod::DirectPsb,
1098                "inverse_bfgs" => HessianMethod::InverseBfgs,
1099                "bofill" => HessianMethod::Bofill,
1100                "powell" | "sr1" => HessianMethod::Powell,
1101                "bfgs_powell_mix" | "mix" => HessianMethod::BfgsPowellMix,
1102                _ => {
1103                    eprintln!(
1104                        "Warning: Invalid hessian method '{}', expected 'direct_psb', 'inverse_bfgs', 'bofill', 'powell', or 'bfgs_powell_mix', using 'direct_psb'",
1105                        value
1106                    );
1107                    HessianMethod::DirectPsb
1108                }
1109            };
1110        }
1111        _ => {}
1112    }
1113    Ok(())
1114}
1115
1116/// Filters comments and cleans tail section content.
1117///
1118/// This function processes raw tail section content by:
1119/// - Removing lines that start with '#' (full-line comments)
1120/// - Removing inline comments (text after '#' on the same line as valid keywords)
1121/// - Trimming leading and trailing whitespace from remaining lines
1122/// - Joining valid lines with single spaces
1123/// - Removing empty lines after trimming
1124///
1125/// # Arguments
1126///
1127/// * `raw_content` - The raw tail section content that may contain comments
1128///
1129/// # Returns
1130///
1131/// A cleaned string containing only valid Gaussian keywords separated by spaces
1132///
1133/// # Examples
1134///
1135/// ```
1136/// use omecp::parser::filter_tail_content;
1137///
1138/// let input = "# This is a comment\nTD(NStates=5)\n# Another comment\nRoot=1";
1139/// let result = filter_tail_content(input);
1140/// assert_eq!(result, "TD(NStates=5) Root=1");
1141///
1142/// // Inline comments are also removed
1143/// let inline = "TD(NStates=5) # inline comment\nRoot=1 # another comment";
1144/// let result = filter_tail_content(inline);
1145/// assert_eq!(result, "TD(NStates=5) Root=1");
1146/// ```
1147pub fn filter_tail_content(raw_content: &str) -> String {
1148    raw_content
1149        .lines()
1150        .filter_map(|line| {
1151            let trimmed = line.trim();
1152
1153            // Skip empty lines
1154            if trimmed.is_empty() {
1155                return None;
1156            }
1157
1158            // Skip lines that start with '#' (full-line comments)
1159            if is_comment_line(line) {
1160                return None;
1161            }
1162
1163            // Remove inline comments (everything after '#' on the same line)
1164            let cleaned_line = if let Some(comment_pos) = trimmed.find('#') {
1165                trimmed[..comment_pos].trim()
1166            } else {
1167                trimmed
1168            };
1169
1170            // Return the cleaned line if it's not empty after comment removal
1171            if cleaned_line.is_empty() {
1172                None
1173            } else {
1174                Some(cleaned_line)
1175            }
1176        })
1177        .collect::<Vec<_>>()
1178        .join(" ")
1179}
1180
1181/// Checks if a line is a comment line.
1182///
1183/// A line is considered a comment if it starts with '#' after trimming
1184/// leading whitespace.
1185///
1186/// # Arguments
1187///
1188/// * `line` - The line to check
1189///
1190/// # Returns
1191///
1192/// `true` if the line is a comment, `false` otherwise
1193///
1194/// # Examples
1195///
1196/// ```
1197/// use omecp::parser::is_comment_line;
1198///
1199/// assert_eq!(is_comment_line("# This is a comment"), true);
1200/// assert_eq!(is_comment_line("  # Indented comment"), true);
1201/// assert_eq!(is_comment_line("TD(NStates=5)"), false);
1202/// assert_eq!(is_comment_line(""), false);
1203/// ```
1204pub fn is_comment_line(line: &str) -> bool {
1205    line.trim().starts_with('#')
1206}
1207
1208/// Validates Gaussian keyword syntax.
1209///
1210/// Performs basic validation to ensure the content contains valid Gaussian
1211/// syntax after comment filtering. Checks for:
1212/// - Invalid characters that would break Gaussian input
1213/// - Proper keyword formatting
1214///
1215/// # Arguments
1216///
1217/// * `content` - The cleaned content to validate
1218///
1219/// # Returns
1220///
1221/// `Ok(())` if the content is valid, or `Err(ParseError)` with details
1222/// about the validation failure
1223///
1224/// # Examples
1225///
1226/// ```
1227/// use omecp::parser::validate_gaussian_keywords;
1228///
1229/// // Valid content
1230/// assert!(validate_gaussian_keywords("TD(NStates=5) Root=1").is_ok());
1231///
1232/// // Invalid content with newlines
1233/// assert!(validate_gaussian_keywords("TD(NStates=5)\nRoot=1").is_err());
1234/// ```
1235pub fn validate_gaussian_keywords(content: &str) -> Result<()> {
1236    validate_gaussian_syntax(content, "tail")
1237}
1238
1239/// Validates basic Gaussian syntax patterns.
1240///
1241/// Implements comprehensive validation for Gaussian input syntax to ensure
1242/// that content will not break Gaussian execution. This function checks for:
1243/// - Invalid characters that would cause parsing errors
1244/// - Malformed keyword structures
1245/// - Remaining comment artifacts
1246///
1247/// # Arguments
1248///
1249/// * `content` - The content to validate for Gaussian compatibility
1250/// * `section_name` - The name of the section being validated for error reporting
1251///
1252/// # Returns
1253///
1254/// `Ok(())` if the content is valid, or `Err(ParseError::GaussianSyntaxError)`
1255/// with detailed information about the validation failure
1256///
1257/// # Examples
1258///
1259/// ```
1260/// use omecp::parser::validate_gaussian_syntax;
1261///
1262/// // Valid Gaussian keywords
1263/// assert!(validate_gaussian_syntax("TD(NStates=5) Root=1", "TAIL_a").is_ok());
1264///
1265/// // Invalid content with newlines
1266/// assert!(validate_gaussian_syntax("TD(NStates=5)\nRoot=1", "TAIL_a").is_err());
1267///
1268/// // Invalid content with comment characters
1269/// assert!(validate_gaussian_syntax("TD(NStates=5) # comment", "TAIL_a").is_err());
1270/// ```
1271pub fn validate_gaussian_syntax(content: &str, section_name: &str) -> Result<()> {
1272    if content.is_empty() {
1273        return Ok(());
1274    }
1275
1276    // Check for invalid characters that would break Gaussian input
1277    let invalid_chars = ['\n', '\r', '\t'];
1278
1279    for ch in invalid_chars {
1280        if content.contains(ch) {
1281            let char_name = match ch {
1282                '\n' => "newline",
1283                '\r' => "carriage return",
1284                '\t' => "tab",
1285                _ => "unknown",
1286            };
1287
1288            let suggestion = match ch {
1289                '\n' | '\r' => "Put all keywords on a single line or separate them with spaces instead of line breaks",
1290                '\t' => "Replace tabs with single spaces between keywords",
1291                _ => "Remove this character and use spaces to separate keywords",
1292            };
1293
1294            return Err(ParseError::GaussianSyntaxError {
1295                section: section_name.to_string(),
1296                details: format!("Invalid {} character found in Gaussian keywords. Gaussian route sections must be on a single line. Suggestion: {}", char_name, suggestion),
1297                line_number: None,
1298            });
1299        }
1300    }
1301
1302    // Check for remaining comment characters (should not happen after filtering)
1303    if content.contains('#') {
1304        return Err(ParseError::GaussianSyntaxError {
1305            section: section_name.to_string(),
1306            details: "Comment character '#' found in keywords. Comments are not allowed in Gaussian route sections. Suggestion: Move comments to separate lines in your OpenMECP input file, or remove them entirely from the tail section.".to_string(),
1307            line_number: None,
1308        });
1309    }
1310
1311    // Check for potentially problematic characters that might break Gaussian parsing
1312    let problematic_chars = ['|', '&', ';', '>', '<'];
1313    for ch in problematic_chars {
1314        if content.contains(ch) {
1315            let suggestion = match ch {
1316                '|' => "Use parentheses or commas to separate options instead of pipes",
1317                '&' => "Remove ampersands - they are not valid in Gaussian keywords",
1318                ';' => "Use spaces or commas to separate keywords instead of semicolons",
1319                '>' | '<' => {
1320                    "Remove redirection operators - they are not valid in Gaussian keywords"
1321                }
1322                _ => "Remove this character and check Gaussian manual for proper keyword syntax",
1323            };
1324
1325            return Err(ParseError::GaussianSyntaxError {
1326                section: section_name.to_string(),
1327                details: format!("Potentially problematic character '{}' found. This may cause Gaussian parsing errors. Suggestion: {}", ch, suggestion),
1328                line_number: None,
1329            });
1330        }
1331    }
1332
1333    // Check for unbalanced parentheses which are common in Gaussian keywords
1334    let open_parens = content.chars().filter(|&c| c == '(').count();
1335    let close_parens = content.chars().filter(|&c| c == ')').count();
1336    if open_parens != close_parens {
1337        let suggestion = if open_parens > close_parens {
1338            format!("Add {} closing parenthesis ')'", open_parens - close_parens)
1339        } else {
1340            format!(
1341                "Remove {} extra closing parenthesis ')' or add matching opening parenthesis '('",
1342                close_parens - open_parens
1343            )
1344        };
1345
1346        return Err(ParseError::GaussianSyntaxError {
1347            section: section_name.to_string(),
1348            details: format!("Unbalanced parentheses in Gaussian keywords: {} opening '(', {} closing ')'. Suggestion: {}", open_parens, close_parens, suggestion),
1349            line_number: None,
1350        });
1351    }
1352
1353    // Check for excessive whitespace that might cause issues
1354    if content.contains("  ") {
1355        return Err(ParseError::GaussianSyntaxError {
1356            section: section_name.to_string(),
1357            details: "Multiple consecutive spaces found between keywords. Gaussian prefers single spaces. Suggestion: Replace multiple spaces with single spaces between keywords.".to_string(),
1358            line_number: None,
1359        });
1360    }
1361
1362    // Check for common Gaussian keyword formatting issues
1363    if content.contains("=,") || content.contains(",=") {
1364        return Err(ParseError::GaussianSyntaxError {
1365            section: section_name.to_string(),
1366            details: "Invalid syntax with equals sign and comma combination. Suggestion: Check Gaussian manual for proper keyword=value syntax.".to_string(),
1367            line_number: None,
1368        });
1369    }
1370
1371    // Check for trailing or leading commas which can cause issues
1372    let trimmed = content.trim();
1373    if trimmed.starts_with(',') || trimmed.ends_with(',') {
1374        return Err(ParseError::GaussianSyntaxError {
1375            section: section_name.to_string(),
1376            details: "Keywords start or end with comma, which may cause parsing issues. Suggestion: Remove leading/trailing commas and ensure proper keyword syntax.".to_string(),
1377            line_number: None,
1378        });
1379    }
1380
1381    Ok(())
1382}
1383
1384/// Validates tail section content with enhanced context and error reporting.
1385///
1386/// This function provides comprehensive validation with informational messages
1387/// about empty sections and detailed error reporting for formatting issues.
1388/// It's designed to give users clear guidance on how to fix their input files.
1389///
1390/// The function gracefully handles empty tail sections after comment filtering
1391/// by providing informational context without causing errors. This is important
1392/// because users may have tail sections that contain only comments, which become
1393/// empty after filtering but should not cause the parsing to fail.
1394///
1395/// # Arguments
1396///
1397/// * `content` - The tail section content to validate
1398/// * `section_name` - The name of the section (e.g., "TAIL_a", "TAIL_b") for error reporting
1399///
1400/// # Returns
1401///
1402/// `Ok(())` if the content is valid (including empty content), or `Err(ParseError)`
1403/// with detailed error information and suggestions for fixing issues
1404///
1405/// # Examples
1406///
1407/// ```
1408/// use omecp::parser::validate_tail_section_with_context;
1409///
1410/// // Empty content is handled gracefully
1411/// assert!(validate_tail_section_with_context("", "TAIL_a").is_ok());
1412///
1413/// // Valid content passes validation
1414/// assert!(validate_tail_section_with_context("TD(NStates=5)", "TAIL_a").is_ok());
1415/// ```
1416pub fn validate_tail_section_with_context(content: &str, section_name: &str) -> Result<()> {
1417    // Handle empty content gracefully with informational context
1418    if content.is_empty() {
1419        // Log informational message about empty tail section
1420        // This is acceptable and doesn't cause errors, but provides user awareness
1421        log_empty_tail_section_info(section_name);
1422        return Ok(());
1423    }
1424
1425    // Delegate to the main validation function for non-empty content
1426    validate_tail_section(content, section_name)
1427}
1428
1429/// Validates tail section content with filtering context awareness.
1430///
1431/// This function provides enhanced validation that takes into account the filtering
1432/// process. It validates the content and provides informational messages when
1433/// sections are empty, helping users understand the filtering behavior.
1434///
1435/// # Arguments
1436///
1437/// * `content` - The filtered tail section content to validate
1438/// * `section_name` - The name of the section (e.g., "TAIL_a", "TAIL_b") for error reporting
1439///
1440/// # Returns
1441///
1442/// `Ok(())` if the content is valid (including empty content), or `Err(ParseError)`
1443/// with detailed error information and suggestions for fixing issues
1444pub fn validate_tail_section_with_filtering_context(
1445    content: &str,
1446    section_name: &str,
1447) -> Result<()> {
1448    // Use the existing validation function which already handles empty content gracefully
1449    validate_tail_section_with_context(content, section_name)
1450}
1451
1452/// Logs informational message about empty tail sections after filtering.
1453///
1454/// This function provides user-friendly information when tail sections become
1455/// empty after comment removal. It helps users understand that this is acceptable
1456/// behavior and provides guidance on when they might want to add content.
1457///
1458/// # Arguments
1459///
1460/// * `section_name` - The name of the empty section for context
1461///
1462/// # Implementation Note
1463///
1464/// Currently uses `eprintln!` for immediate user feedback. In a production
1465/// environment, this could be replaced with a proper logging framework
1466/// or structured logging system.
1467fn log_empty_tail_section_info(section_name: &str) {
1468    // Provide informational message to stderr (non-blocking for normal operation)
1469    // This gives users awareness without causing errors or stopping execution
1470    eprintln!(
1471        "INFO: {} section is empty after comment filtering. This is acceptable behavior.",
1472        section_name
1473    );
1474    eprintln!(
1475        "      If you intended to include Gaussian keywords in {}, ensure they are not commented out.",
1476        section_name
1477    );
1478    eprintln!(
1479        "      Common {} keywords: TD(NStates=5,Root=1), TD(NStates=10), etc.",
1480        section_name
1481    );
1482}
1483
1484/// Validates tail section content for Gaussian compatibility.
1485///
1486/// This function performs comprehensive validation of tail section content
1487/// after comment filtering to ensure it will work correctly with Gaussian.
1488/// It checks for proper syntax and provides meaningful error messages with
1489/// specific suggestions for fixing common formatting issues.
1490///
1491/// # Arguments
1492///
1493/// * `content` - The tail section content to validate
1494/// * `section_name` - The name of the section (e.g., "TAIL_a", "TAIL_b") for error reporting
1495///
1496/// # Returns
1497///
1498/// `Ok(())` if the content is valid, or `Err(ParseError)` with details
1499/// about the validation failure including the section name and helpful suggestions
1500///
1501/// # Examples
1502///
1503/// ```
1504/// use omecp::parser::validate_tail_section;
1505///
1506/// // Valid content
1507/// assert!(validate_tail_section("TD(NStates=5) Root=1", "TAIL_a").is_ok());
1508///
1509/// // Empty content is acceptable
1510/// assert!(validate_tail_section("", "TAIL_a").is_ok());
1511///
1512/// // Invalid content with comments (should be filtered before validation)
1513/// assert!(validate_tail_section("TD(NStates=5) # comment", "TAIL_a").is_err());
1514/// ```
1515pub fn validate_tail_section(content: &str, section_name: &str) -> Result<()> {
1516    // Empty content after filtering is acceptable - provide informational context
1517    if content.is_empty() {
1518        return Ok(());
1519    }
1520
1521    // Perform additional tail-section specific validations before general syntax validation
1522
1523    // Check for common TD-DFT keyword patterns and provide specific guidance
1524    if content.to_uppercase().contains("TD") && !content.contains('(') {
1525        let suggestion_message = format!(
1526            "TD keyword found without required parentheses in {} section.\n\nProblem: TD-DFT calculations require parameters in parentheses.\n\nSuggestions:\n  • Use 'TD(NStates=N)' where N is the number of excited states (e.g., TD(NStates=5))\n  • For specific roots: 'TD(NStates=N,Root=M)' where M is the state number\n  • Common examples:\n    - TD(NStates=5,Root=1)  # First excited state, calculate 5 states\n    - TD(NStates=10)        # Calculate 10 excited states",
1527            section_name
1528        );
1529
1530        return Err(ParseError::InvalidTailSection {
1531            section: section_name.to_string(),
1532            message: suggestion_message,
1533            line_number: None,
1534        });
1535    }
1536
1537    // Check for ROOT keyword without TD
1538    if content.to_uppercase().contains("ROOT") && !content.to_uppercase().contains("TD") {
1539        let suggestion_message = format!(
1540            "ROOT keyword found without TD keyword in {} section.\n\nProblem: ROOT is typically used with TD-DFT calculations to specify which excited state to optimize.\n\nSuggestions:\n  • Use 'TD(NStates=N,Root=M)' for combined TD-DFT with root specification\n  • Example: 'TD(NStates=5,Root=1)' to optimize the first excited state\n  • If you need separate keywords: 'TD(NStates=5) Root=1'",
1541            section_name
1542        );
1543
1544        return Err(ParseError::InvalidTailSection {
1545            section: section_name.to_string(),
1546            message: suggestion_message,
1547            line_number: None,
1548        });
1549    }
1550
1551    // Check for common misspellings or formatting issues
1552    let common_issues = [
1553        (
1554            "NSTATES",
1555            "Use 'NStates' instead of 'NSTATES' (case-sensitive)",
1556        ),
1557        (
1558            "nstates",
1559            "Use 'NStates' instead of 'nstates' (case-sensitive)",
1560        ),
1561        ("root=", "Use 'Root=' instead of 'root=' (case-sensitive)"),
1562        ("ROOT=", "Use 'Root=' instead of 'ROOT=' (case-sensitive)"),
1563        ("td(", "Use 'TD(' instead of 'td(' (case-sensitive)"),
1564    ];
1565
1566    for (wrong, suggestion) in &common_issues {
1567        if content.contains(wrong) {
1568            return Err(ParseError::InvalidTailSection {
1569                section: section_name.to_string(),
1570                message: format!(
1571                    "Potential keyword formatting issue: found '{}'. Suggestion: {}",
1572                    wrong, suggestion
1573                ),
1574                line_number: None,
1575            });
1576        }
1577    }
1578
1579    // Check for missing equals signs in keyword=value pairs
1580    if content.contains("NStates") && !content.contains("NStates=") {
1581        let suggestion_message = format!(
1582            "NStates keyword found without value assignment in {} section.\n\nProblem: NStates requires a numeric value to specify how many excited states to calculate.\n\nSuggestions:\n  • Use 'NStates=N' where N is the number of excited states\n  • Common values: NStates=5, NStates=10, NStates=20\n  • Example: 'TD(NStates=5,Root=1)' or 'TD(NStates=5) Root=1'",
1583            section_name
1584        );
1585
1586        return Err(ParseError::InvalidTailSection {
1587            section: section_name.to_string(),
1588            message: suggestion_message,
1589            line_number: None,
1590        });
1591    }
1592
1593    if content.contains("Root") && !content.contains("Root=") {
1594        let suggestion_message = format!(
1595            "Root keyword found without value assignment in {} section.\n\nProblem: Root requires a numeric value to specify which excited state to optimize.\n\nSuggestions:\n  • Use 'Root=N' where N is the excited state number (starting from 1)\n  • Example: 'Root=1' for first excited state, 'Root=2' for second excited state\n  • Complete example: 'TD(NStates=5,Root=1)' or 'TD(NStates=5) Root=1'",
1596            section_name
1597        );
1598
1599        return Err(ParseError::InvalidTailSection {
1600            section: section_name.to_string(),
1601            message: suggestion_message,
1602            line_number: None,
1603        });
1604    }
1605
1606    // Validate the content using the existing Gaussian keyword validation
1607    validate_gaussian_keywords(content).map_err(|e| match e {
1608        ParseError::GaussianSyntaxError {
1609            details,
1610            line_number,
1611            ..
1612        } => {
1613            // Create a user-friendly error message with specific suggestions
1614            let user_friendly_message =
1615                create_user_friendly_tail_error(&details, section_name, content);
1616
1617            ParseError::InvalidTailSection {
1618                section: section_name.to_string(),
1619                message: user_friendly_message,
1620                line_number,
1621            }
1622        }
1623        _ => {
1624            // Handle other types of validation errors
1625            let fallback_message = format!(
1626                "Validation failed for {} section: {}\n\n{}",
1627                section_name,
1628                e,
1629                get_tail_section_suggestions(content, section_name)
1630            );
1631
1632            ParseError::InvalidTailSection {
1633                section: section_name.to_string(),
1634                message: fallback_message,
1635                line_number: None,
1636            }
1637        }
1638    })
1639}
1640
1641fn read_geom_from_xyz(path: &Path) -> Result<(Vec<String>, Vec<f64>)> {
1642    let content = fs::read_to_string(path)?;
1643    let mut elements = Vec::new();
1644    let mut coords = Vec::new();
1645
1646    for line in content.lines() {
1647        let line = line.trim();
1648        if line.is_empty() || !line.chars().next().is_some_and(|c| c.is_alphabetic()) {
1649            continue;
1650        }
1651        let parts: Vec<&str> = line.split_whitespace().collect();
1652        if parts.len() >= 4 {
1653            elements.push(parts[0].to_string());
1654            for &coord_str in &parts[1..4] {
1655                coords.push(
1656                    coord_str
1657                        .parse()
1658                        .map_err(|_| ParseError::Parse("Invalid coordinate in XYZ file".into()))?,
1659                );
1660            }
1661        }
1662    }
1663    Ok((elements, coords))
1664}
1665
1666fn read_geom_from_gjf(path: &Path) -> Result<(Vec<String>, Vec<f64>)> {
1667    let content = fs::read_to_string(path)?;
1668    let mut elements = Vec::new();
1669    let mut coords = Vec::new();
1670    let mut in_geom = false;
1671
1672    for line in content.lines() {
1673        let line = line.trim();
1674        if line.is_empty() {
1675            continue;
1676        }
1677        if line.contains("0 1") || line.contains("0 2") || line.contains("0 3") {
1678            in_geom = true;
1679            continue;
1680        }
1681        if in_geom && line.chars().next().is_some_and(|c| c.is_alphabetic()) {
1682            let parts: Vec<&str> = line.split_whitespace().collect();
1683            if parts.len() >= 4 {
1684                elements.push(parts[0].to_string());
1685                for &coord_str in &parts[1..4] {
1686                    coords.push(
1687                        coord_str.parse().map_err(|_| {
1688                            ParseError::Parse("Invalid coordinate in GJF file".into())
1689                        })?,
1690                    );
1691                }
1692            }
1693        } else if in_geom && line.chars().next().is_some_and(|c| c.is_ascii_digit()) {
1694            break;
1695        }
1696    }
1697    Ok((elements, coords))
1698}
1699
1700fn read_geom_from_log(path: &Path) -> Result<(Vec<String>, Vec<f64>)> {
1701    let content = fs::read_to_string(path)?;
1702    let mut elements = Vec::new();
1703    let mut coords = Vec::new();
1704    let mut in_geom = false;
1705
1706    for line in content.lines() {
1707        if line.contains("Input orientation") {
1708            in_geom = true;
1709            elements.clear();
1710            coords.clear();
1711            continue;
1712        } else if line.contains("Distance matrix") || line.contains("Rotational constants") {
1713            in_geom = false;
1714            continue;
1715        }
1716        if in_geom {
1717            let parts: Vec<&str> = line.split_whitespace().collect();
1718            if parts.len() >= 6 && parts[0].chars().all(|c| c.is_ascii_digit()) {
1719                // Atomic number, atomic number, 0, x, y, z
1720                elements.push(parts[1].to_string());
1721                for &coord_str in &parts[3..6] {
1722                    coords.push(
1723                        coord_str.parse().map_err(|_| {
1724                            ParseError::Parse("Invalid coordinate in LOG file".into())
1725                        })?,
1726                    );
1727                }
1728            }
1729        }
1730    }
1731    Ok((elements, coords))
1732}
1733
1734/// Provides suggestions for fixing common tail section formatting issues.
1735///
1736/// This function analyzes the content and provides specific, actionable
1737/// suggestions based on common mistakes users make when writing tail sections.
1738///
1739/// # Arguments
1740///
1741/// * `content` - The problematic content to analyze
1742/// * `section_name` - The name of the section for context
1743///
1744/// # Returns
1745///
1746/// A string containing specific suggestions for fixing the issues
1747fn get_tail_section_suggestions(content: &str, section_name: &str) -> String {
1748    let mut suggestions = Vec::new();
1749
1750    // Check for common patterns and provide specific advice
1751    if content.contains('\n') {
1752        suggestions.push("Put all keywords on a single line separated by spaces".to_string());
1753    }
1754
1755    if content.contains('#') {
1756        suggestions.push("Move comments to separate lines outside the tail section".to_string());
1757    }
1758
1759    if content.to_uppercase().contains("TD") && !content.contains('(') {
1760        suggestions.push("Add parentheses to TD keyword: TD(NStates=5,Root=1)".to_string());
1761    }
1762
1763    if content.contains("  ") {
1764        suggestions.push("Replace multiple spaces with single spaces between keywords".to_string());
1765    }
1766
1767    if content.chars().filter(|&c| c == '(').count()
1768        != content.chars().filter(|&c| c == ')').count()
1769    {
1770        suggestions.push("Check that all parentheses are properly balanced".to_string());
1771    }
1772
1773    // Add general guidance if no specific issues found
1774    if suggestions.is_empty() {
1775        suggestions.push(format!("Ensure {} contains valid Gaussian keywords (e.g., 'TD(NStates=5,Root=1)' for TD-DFT calculations)", section_name));
1776        suggestions.push("Check the Gaussian manual for proper keyword syntax".to_string());
1777        suggestions.push("Remove any shell command characters or special symbols".to_string());
1778    }
1779
1780    format!(
1781        "Suggestions for fixing {} section:\n  • {}",
1782        section_name,
1783        suggestions.join("\n  • ")
1784    )
1785}
1786
1787/// Creates a user-friendly error message for tail section validation failures.
1788///
1789/// This function takes technical validation errors and converts them into
1790/// clear, actionable error messages that help users understand and fix
1791/// their input file formatting issues.
1792///
1793/// # Arguments
1794///
1795/// * `error_details` - The technical error details from validation
1796/// * `section_name` - The name of the section where the error occurred
1797/// * `content` - The problematic content for generating suggestions
1798///
1799/// # Returns
1800///
1801/// A formatted error message with context and suggestions
1802fn create_user_friendly_tail_error(
1803    error_details: &str,
1804    section_name: &str,
1805    content: &str,
1806) -> String {
1807    let suggestions = get_tail_section_suggestions(content, section_name);
1808
1809    format!(
1810        "Problem in {} section: {}\n\n{}\n\nExample of valid {} content:\n  TD(NStates=5,Root=1)\n  or\n  TD(NStates=10) Root=2",
1811        section_name,
1812        error_details,
1813        suggestions,
1814        section_name
1815    )
1816}
1817
1818fn read_external_geometry(path: &Path) -> Result<(Vec<String>, Vec<f64>)> {
1819    let path_str = path.to_string_lossy();
1820    if path_str.ends_with(".xyz") {
1821        read_geom_from_xyz(path)
1822    } else if path_str.ends_with(".gjf") {
1823        read_geom_from_gjf(path)
1824    } else if path_str.ends_with(".log") {
1825        read_geom_from_log(path)
1826    } else {
1827        Err(ParseError::Parse(
1828            "Unsupported external geometry file format".into(),
1829        ))
1830    }
1831}
omecp/parser.rs

omecp/
parser.rs