Alex Raster Alex Raster - 3 months ago 21
Python Question

Iterating through capture fields in a Rust regex

I'm playing around with the frowns parser available from http://frowns.sourceforge.net, a parser that tokenizes chemical formula strings written in the SMILES standard. Specifically, I'm trying to port it to Rust.

The original regex for an "atom" token in the parser looks like this (Python):

# Regex alternation of the element symbols recognized by the parser. It is
# spliced into the `element` group of the `atom` pattern. The lowercase
# alternatives c, n, o, s, p are separate entries, and the final `\*`
# matches a literal asterisk.
element_symbols_pattern = \
r"C[laroudsemf]?|Os?|N[eaibdpos]?|S[icernbmg]?|P[drmtboau]?|" \
r"H[eofgas]?|c|n|o|s|p|A[lrsgutcm]|B[eraik]?|Dy|E[urs]|F[erm]?|" \
r"G[aed]|I[nr]?|Kr?|L[iaur]|M[gnodt]|R[buhenaf]|T[icebmalh]|" \
r"U|V|W|Xe|Yb?|Z[nr]|\*"

# Names of the named capture groups of the `atom` regex, listed in the same
# order in which the groups appear in the pattern. The three `error_*`
# entries correspond to the groups that capture malformed input.
atom_fields = [
"raw_atom",
"open_bracket",
"weight",
"element",
"chiral_count",
"chiral_named",
"chiral_symbols",
"hcount",
"positive_count",
"positive_symbols",
"negative_count",
"negative_symbols",
"error_1",
"error_2",
"close_bracket",
"error_3",
]

# Verbose-mode (re.X) pattern for a single atom token. It matches either a
# bare atom outside brackets (`raw_atom`) or a bracketed term made of an
# optional weight, an element, optional chirality, optional hydrogen count
# and optional charge. Leftover or unrecognized text inside the brackets is
# captured by `error_1` / `error_2`, and `error_3` matches when the string
# ends before a closing bracket is seen.
# NOTE(review): the inline pattern comment next to `@TH[12]` reads
# "@TA1 @TA2"; it presumably should read "@TH1 @TH2". It lives inside the
# pattern string, so it is left untouched here.
atom = re.compile(r"""
(?P<raw_atom>Cl|Br|[cnospBCNOFPSI]) | # "raw" means outside of brackets
(
(?P<open_bracket>\[) # Start bracket
(?P<weight>\d+)? # Atomic weight (optional)
( # valid term or error
( # valid term
(?P<element>""" + element_symbols_pattern + r""") # element or aromatic
( # Chirality can be
(?P<chiral_count>@\d+) | # @1 @2 @3 ...
(?P<chiral_named> # or
@TH[12] | # @TA1 @TA2
@AL[12] | # @AL1 @AL2
@SP[123] | # @SP1 @SP2 @SP3
@TB(1[0-9]?|20?|[3-9]) | # @TB{1-20}
@OH(1[0-9]?|2[0-9]?|30?|[4-9])) | # @OH{1-30}
(?P<chiral_symbols>@+) # or @@@@@@@...
)? # and chirality is optional
(?P<hcount>H\d*)? # Optional hydrogen count
( # Charges can be
(?P<positive_count>\+\d+) | # +<number>
(?P<positive_symbols>\++) | # +++... This includes the single '+'
(?P<negative_count>-\d+) | # -<number>
(?P<negative_symbols>-+) # ---... including a single '-'
)? # and are optional
(?P<error_1>[^\]]+)? # If there's anything left, it's an error
) | ( # End of parsing stuff in []s, except
(?P<error_2>[^\]]*) # If there was an error, we get here
))
((?P<close_bracket>\])| # End bracket
(?P<error_3>$)) # unexpectedly reached end of string
)
""", re.X)


The field list is used to improve the reportability of the regex parser, as well as to track parsing errors.

I wrote something that compiles and correctly parses tokens without brackets, but something about the inclusion of brackets (such as `[S]` instead of `S`) breaks it. So I've narrowed it down with comments:

extern crate regex;
use regex::Regex;

// Reduced reproduction of the problem: assembles a cut-down version of the
// atom regex (most pieces are commented out), prints it, compiles it, and
// then runs it against each character of "[S]" one at a time.
fn main() {
// Capture-group names, mirroring the Python `atom_fields` list. In this
// reduced version the list is only referenced by the commented-out loop below.
let atom_fields: Vec<&'static str> = vec![
"raw_atom",
"open_bracket",
"weight",
"element",
"chiral_count",
"chiral_named",
"chiral_symbols",
"hcount",
"positive_count",
"positive_symbols",
"negative_count",
"negative_symbols",
"error_1",
"error_2",
"close_bracket",
"error_3"
];

// Stand-in for the full element alternation, reduced to an optional "S".
// NOTE(review): because the literal is delimited by r#"..."#, the `"` after
// `\*` is part of the pattern (it also shows up in the generated regex), so
// the second alternative matches `*"` rather than `*` - likely unintended.
const EL_SYMBOLS: &'static str = r#"(?P<element>S?|\*")"#;
// The pattern is built by joining the fragments below with no separator;
// the commented-out fragments are the parts of the original regex that were
// disabled while narrowing the problem down.
let atom_re_str: &String = &String::from(vec![
// r"(?P<raw_atom>Cl|Br|[cnospBCNOFPSI])|", // "raw" means outside of brackets
r"(",
r"(?P<open_bracket>\[)", // Start bracket
// r"(?P<weight>\d+)?", // Atomic weight (optional)
r"(", // valid term or error
r"(", // valid term
&EL_SYMBOLS, // element or aromatic
// r"(", // Chirality can be
// r"(?P<chiral_count>@\d+)|", // @1 @2 @3 ...
// r"(?P<chiral_named>", // or
// r"@TH[12]|", // @TH1 @TH2
// r"@AL[12]|", // @AL1 @AL2
// r"@SP[123]|", // @SP1 @SP2 @SP3
// r"@TB(1[0-9]?|20?|[3-9])|", // @TB{1-20}
// r"@OH(1[0-9]?|2[0-9]?|30?|[4-9]))|", // @OH{1-30}
// r"(?P<chiral_symbols>@+)", // or @@@@....,
// r")?", // and chirality is optional
// r"(?P<hcount>H\d*)?", // Optional hydrogen count
// r"(", // Charges can be
// r"(?P<positive_count>\+\d+)|", // +<number>
// r"(?P<positive_symbols>\++)|", // +++...including a single '+'
// r"(?P<negative_count>-\d+)|", // -<number>
// r"(?P<negative_symbols>-+)", // ---... including a single '-'
// r")?", // and are optional
// r"(?P<error_1>[^\]]+)?", // anything left is an error
r")", // End of stuff in []s, except
r"|((?P<error_2>[^\]]*)", // If other error, we get here
r"))",
r"((?P<close_bracket>\])|", // End bracket
r"(?P<error_3>$)))"].join("")); // unexpected end of string

println!("generated regex: {}", &atom_re_str);
// Panics here if the assembled pattern is not a valid regex.
let atom_re = Regex::new(&atom_re_str).unwrap();

// The input is fed to the regex one character at a time, so every call to
// `captures` sees a one-character string rather than the whole "[S]".
for cur_char in "[S]".chars() {
let cur_string = cur_char.to_string();
println!("cur string: {}", &cur_string);
// `unwrap()` panics when `captures` finds no match. Every alternative left
// in the reduced pattern begins with `\[`, so a one-character string without
// an opening bracket (such as "S") cannot match.
let captures = atom_re.captures(&cur_string.as_str()).unwrap();
// Disabled error-reporting loop over the named groups.
// NOTE(review): it tests a group named "atom", which does not appear among
// the groups defined in the pattern above.
// if captures.name("atom").is_some() {
// for cur_field in &atom_fields {
// let field_capture = captures.name(cur_field);
// if cur_field.contains("error") {
// if *cur_field == "error_3" {
// // TODO replace me with a real error
// println!("current char: {:?}", &cur_char);
// panic!("Missing a close bracket (]). Looks like: {}.",
// field_capture.unwrap());
// } else {
// panic!("I don't recognize the character. Looks like: {}.",
// field_capture.unwrap());
// }
// } else {
// println!("ok! matched {:?}", &cur_char);
// }
// }
// }
}
}


--

You can see that the generated Rust regex works in Debuggex:

((?P<open_bracket>\[)(((?P<element>S?|\*"))|((?P<error_2>[^\]]*)))((?P<close_bracket>\])|(?P<error_3>$)))


Regular expression visualization

(http://debuggex.com/r/7j75Y2F1ph1v9jfL)

If you run the example (https://gitlab.com/araster/frowns_regex), you'll see that the open bracket parses correctly, but the `.captures().unwrap()` call dies on the next character, 'S'. If I use the complete expression, I can parse all kinds of things from the frowns test file, as long as they don't have brackets.

What am I doing wrong?

Answer

You are iterating over each character of your input string and trying to match the regex against a string composed of a single character. However, this regex is not designed to match individual characters; it will match [S] as a whole. The one-character string "S" contains no opening bracket, so the regex finds no match, captures returns None, and unwrap panics.

If you want to be able to find multiple matches in a single string, use captures_iter instead of captures to iterate over all matches and their respective captures. Each match will be a formula; the regex will skip text that doesn't match a formula.

// Run the regex over the whole input; each iteration yields the captures
// belonging to one match found in the string.
for captures in atom_re.captures_iter("[S]") {
    // check the captures of each match
}

If you only want to find the first match in a string, then use captures on the whole string, rather than on each individual character.

Comments