leetcode/
tag_validator.rs

1//! # 591. 标签验证器
2//!
3//! 难度 困难
4//!
5//! 给定一个表示代码片段的字符串，你需要实现一个验证器来解析这段代码，并返回它是否合法。合法的代码片段需要遵守以下的所有规则：
6//!
7//! 1. 代码必须被合法的闭合标签包围。否则，代码是无效的。
8//! 2. 闭合标签（不一定合法）要严格符合格式：`<TAG_NAME>TAG_CONTENT</TAG_NAME>`。其中，`<TAG_NAME>` 是起始标签，`</TAG_NAME>` 是结束标签。起始和结束标签中的 `TAG_NAME` 应当相同。当且仅当 `TAG_NAME` 和 `TAG_CONTENT` 都是合法的，闭合标签才是合法的。
9//! 3. 合法的 `TAG_NAME` 仅含有大写字母，长度在范围 `[1,9]` 之间。否则，该 `TAG_NAME` 是不合法的。
10//! 4. 合法的 `TAG_CONTENT` 可以包含其他合法的闭合标签，`cdata` （请参考规则7）和任意字符（注意参考规则1）除了不匹配的 `<`、不匹配的起始和结束标签、不匹配的或带有不合法 `TAG_NAME` 的闭合标签。否则，`TAG_CONTENT` 是不合法的。
11//! 5. 一个起始标签，如果没有具有相同 `TAG_NAME` 的结束标签与之匹配，是不合法的。反之亦然。不过，你也需要考虑标签嵌套的问题。
12//! 6. 一个<，如果你找不到一个后续的 `>` 与之匹配，是不合法的。并且当你找到一个 `<` 或 `</` 时，所有直到下一个 `>` 的前的字符，都应当被解析为 `TAG_NAME`（不一定合法）。
13//! 7. `cdata` 有如下格式：`<![CDATA[CDATA_CONTENT]]>` 。`CDATA_CONTENT` 的范围被定义成 `<![CDATA[ 和后续的第一个 ]]>` 之间的字符。
14//! 8. `CDATA_CONTENT` 可以包含任意字符。`cdata` 的功能是阻止验证器解析 `CDATA_CONTENT`，所以即使其中有一些字符可以被解析为标签（无论合法还是不合法），也应该将它们视为常规字符。
15//!
16//! 合法代码的例子:
17//!
18//! ```text
19//! 输入: "<DIV>This is the first line <![CDATA[<div>]]></DIV>"
20//!
21//! 输出: True
22//!
23//! 解释:
24//!
25//! 代码被包含在了闭合的标签内： <DIV> 和 </DIV> 。
26//!
27//! TAG_NAME 是合法的，TAG_CONTENT 包含了一些字符和 cdata 。
28//!
29//! 即使 CDATA_CONTENT 含有不匹配的起始标签和不合法的 TAG_NAME，它应该被视为普通的文本，而不是标签。
30//!
31//! 所以 TAG_CONTENT 是合法的，因此代码是合法的。最终返回True。
32//!
33//!
34//! 输入: "<DIV>>>  ![cdata[]] <![CDATA[<div>]>]]>]]>>]</DIV>"
35//!
36//! 输出: True
37//!
38//! 解释:
39//!
40//! 我们首先将代码分割为： start_tag|tag_content|end_tag 。
41//!
42//! start_tag -> "<DIV>"
43//!
44//! end_tag -> "</DIV>"
45//!
46//! tag_content 也可被分割为： text1|cdata|text2 。
47//!
48//! text1 -> ">>  ![cdata[]] "
49//!
50//! cdata -> "<![CDATA[<div>]>]]>" ，其中 CDATA_CONTENT 为 "<div>]>"
51//!
52//! text2 -> "]]>>]"
53//!
54//!
55//! start_tag 不是 "<DIV>>>" 的原因参照规则 6 。
56//! cdata 不是 "<![CDATA[<div>]>]]>]]>" 的原因参照规则 7 。
57//! ```
58//!
59//! 不合法代码的例子:
60//!
61//! ```text
62//! 输入: "<A>  <B> </A>   </B>"
63//! 输出: False
64//! 解释: 不合法。如果 "<A>" 是闭合的，那么 "<B>" 一定是不匹配的，反之亦然。
65//!
66//! 输入: "<DIV>  div tag is not closed  <DIV>"
67//! 输出: False
68//!
69//! 输入: "<DIV>  unmatched <  </DIV>"
70//! 输出: False
71//!
72//! 输入: "<DIV> closed tags with invalid tag name  <b>123</b> </DIV>"
73//! 输出: False
74//!
75//! 输入: "<DIV> unmatched tags with invalid tag name  </1234567890> and <CDATA[[]]>  </DIV>"
76//! 输出: False
77//!
78//! 输入: "<DIV>  unmatched start tag <B>  and unmatched end tag </C>  </DIV>"
79//! 输出: False
80//! ```
81//!
82//! ## 注意:
83//!
84//! 为简明起见，你可以假设输入的代码（包括提到的任意字符）只包含`数字`, `字母`, `'<','>','/','!','[',']'` 和 `' '`。
85//!
86//! See [leetcode](https://leetcode-cn.com/problems/tag-validator/)
87
88/// 用有限状态机，处理几个有限的状态就解决了。
89///
90/// Rust 的好处是很容易写比较高级的抽象。而且**对错误处理非常友好**。语法上主要是 Pattern Matching 和各种高阶函数的 combinator 组合。感觉用 Erlang/Elixir 写起来应该会很方便。
91///
92/// 代码逻辑：
93///
94/// 1. `chars()` 生成字符串的 [`Iterator`](Iterator)
95/// 2. `try_fold` 消费这个 [`Iterator`](Iterator)，创建一个新的 `Validator` 对象作为 fold 的初始状态传入。
96/// 3. `try_fold` 的每一次回调处理一个字符，调用 `validator.handle(c)` 返回一个 `Result`。 只要每步都 Ok，那就一直 handle 下去，只要中间发现 Err，就会提早返回 (短路操作)
97/// 4. 最后消费完 [`Iterator`](Iterator) 也就是所有字符都处理完了，需要判断下 `Validator.is_end()` 是否合法结束，即检查 `Validator` 的状态是否是 `End`（完全闭合最外层的标签）。
98///
99/// 主要是 `Validator` 里的几个状态定义：
100///
101/// * `Init` 初始状态，只接收 `<` 就进入 `TagName` 状态，否则失败
102/// * `TagName` 这个状态比较复杂，其实就是 Tag 的定义条件，这里用了一个 `is_close` 来标识是不是 **关闭标签**
103/// * `TagContent` 这个状态简单，只要遇到 `<` 就进入 `TagName` 状态，其它都 Ok
104/// * `CDataTag` 这个也只有一种情况，读取完整的 `[CDATA[` 进入 `CDataContent` 状态，否则失败
105/// * `CDataContent` 这个状态也简单（用了两个 `bool` 标记是否已经有连续两个 `]` ），遇到 `]]>` 就结束，回到 `TagContent` 状态（注意 reset 标记）
106/// * `End` 最外层的 Tag 闭合，结束，不能再有内容（即如果这个状态下再接收到内容，都是 Err）
107///
108/// 这里就 `TagName` 里处理得最多，具体条件结合题目定义，看代码。
109///
110/// 另外，`Validator` 里定义了一个 `stack`，记录 **开标签**，用于与 **闭标签** 匹配。
111///
112/// Rust 的类型系统使得错误处理机制非常严谨，安全。代码里每一个地方都记录了 invalid 的错误信息，可以在最外层调用的地方统一打印。
113///
114/// 因为用 [`Iterator`](Iterator)，是 lazy 调用，只有在每一步迭代的时候才会执行代码，只迭代了一遍。（如果要调试输入的情况，在 `try_fold`
115/// 前面加一个 `inspect` 可以打印出每个字符，结合每个地方的状态变化，可以很轻松地调试）。
116/// 因为代码抽象程度比较好，虽然看起来代码长了一点，但结构非常清晰，可读性，安全性，可扩展/维护性都很优秀。
117pub struct Solution;
118
119impl Solution {
120    pub fn is_valid(code: String) -> bool {
121        code.chars().try_fold(Validator::new(), |mut validator, c| {
122            validator.handle(c).map(|_| validator)
123        })
124        .and_then(|validator| match validator.is_end() {
125            true => Ok(()),
126            _ => Err("EOF"),
127        })
128        // .map_err(|e| {
129        //     eprintln!("Invalid, err: {}", e);
130        // })
131        .is_ok()
132    }
133}
134
135type Result = std::result::Result<(), &'static str>;
136
137enum State {
138    Init,
139    TagName {
140        cache: String,
141        is_close: bool,
142    },
143    TagContent,
144    CDataTag {
145        cache: String,
146    },
147    CDataContent {
148        // stand for close prefix `]]`
149        prefix: (bool, bool),
150    },
151    End,
152}
153
154struct Validator {
155    state: State,
156    stack: Vec<String>,
157}
158
159impl Validator {
160    fn new() -> Self {
161        Self {
162            state: State::Init,
163            stack: Vec::new(),
164        }
165    }
166}
167
168impl Validator {
169    fn is_end(&self) -> bool {
170        match &self.state {
171            State::End => true,
172            _ => false,
173        }
174    }
175
176    fn handle(&mut self, c: char) -> Result {
177        match &self.state {
178            State::Init => self.handle_init(c),
179            State::TagName { .. } => self.handle_tag_name(c),
180            State::TagContent => self.handle_tag_content(c),
181            State::CDataTag { .. } => self.handle_cdata_tag(c),
182            State::CDataContent { .. } => self.handle_cdata_content(c),
183            State::End => self.handle_end(c),
184        }
185    }
186
187    // In the Init state, only accept '<' character then go into next state: TagName
188    fn handle_init(&mut self, c: char) -> Result {
189        match (&self.state, c) {
190            (State::Init, '<') => {
191                self.state = State::TagName {
192                    cache: String::new(),
193                    is_close: false,
194                };
195                Ok(())
196            }
197            (State::Init, _) => Err("Expect <"),
198            _ => panic!("Invalid State, expect Init"),
199        }
200    }
201
202    fn handle_tag_name(&mut self, c: char) -> Result {
203        match self.state {
204            State::TagName { ref mut cache, ref mut is_close } => {
205                match (c, cache.len(), &is_close) {
206                    // meet 'A'..='Z', tag name cache should less then 9
207                    (c @ 'A' ..= 'Z', 0..=8, _) => {
208                        cache.push(c);
209                        Ok(())
210                    }
211                    // meet '>', cache length should in 1..=9
212                    // if not is_close then into TagContent
213                    ('>', 1..=9, false) => {
214                        self.stack.push(cache.to_string());
215                        self.state = State::TagContent;
216                        Ok(())
217                    }
218                    // if is_close then into TagContent or End
219                    ('>', 1..=9, true) => {
220                        match self.stack.pop() {
221                            Some(s) if s == *cache => {
222                                if self.stack.len() > 0 {
223                                    // inner level tag closed, into outer TagContent
224                                    self.state = State::TagContent;
225                                } else {
226                                    // top level tag closed, into End
227                                    self.state = State::End;
228                                }
229                                Ok(())
230                            }
231                            _ => Err("Wrong close tag"),
232                        }
233                    }
234                    // meet '/' only when not is_close and cache is empty
235                    ('/', 0, false) => {
236                        *is_close = true;
237                        Ok(())
238                    }
239                    // otherwise invalid tag character
240                    // ('/', _, true) | ('/', _, false) => Err("Invalid tag char"),
241                    // meet '!' only when cache is empty & not !is_close, which means '!' just after '<'
242                    ('!', 0, false) if !self.stack.is_empty() => {
243                        self.state = State::CDataTag { cache: String::new() };
244                        Ok(())
245                    }
246                    // you can match more different conditions for more error detail
247                    _ => Err("Invalid character in TagName State"),
248                }
249            }
250            _ => panic!("Invalid State, expect TagName"),
251        }
252    }
253
254    fn handle_tag_content(&mut self, c: char) -> Result {
255        match (&self.state, c) {
256            // if meet '<', then into inner TagName
257            (State::TagContent, '<') => {
258                self.state = State::TagName {
259                    cache: String::new(),
260                    is_close: false,
261                };
262                Ok(())
263            }
264            // otherwise all characters would be ok
265            (State::TagContent, _) => Ok(()),
266            _ => panic!("Invalid State, expect TagContent"),
267        }
268    }
269
270    fn handle_cdata_tag(&mut self, c: char) -> Result {
271        match self.state {
272            State::CDataTag { ref mut cache } => {
273                match (c, cache.len()) {
274                    (c, 0..=6) => {
275                        cache.push(c);
276                        if "[CDATA[" == cache.as_str() {
277                            self.state = State::CDataContent { prefix: (false, false) };
278                            Ok(())
279                        } else if "[CDATA[".starts_with(cache.as_str()) {
280                            Ok(())
281                        } else {
282                            Err("Wrong CDataTag character")
283                        }
284                    }
285                    _ => Err("CDataTag invalid"),
286                }
287            }
288            _ => panic!("Invalid State, expect CDataTag"),
289        }
290    }
291
292    fn handle_cdata_content(&mut self, c: char) -> Result {
293        match self.state {
294            State::CDataContent { ref mut prefix } => {
295                match (&prefix.0, &prefix.1, c) {
296                    (true, true, '>') => {
297                        self.state = State::TagContent;
298                        Ok(())
299                    }
300                    (true, true, ']') => Ok(()),
301                    (true, true, _) => {
302                        // reset prefix
303                        *prefix = (false, false);
304                        Ok(())
305                    }
306                    (true, false, ']') => {
307                        *prefix = (true, true);
308                        Ok(())
309                    }
310                    (false, false, ']') => {
311                        *prefix = (true, false);
312                        Ok(())
313                    }
314                    _ => Ok(()),
315                }
316            }
317            _ => panic!("Invalid State, expect CDataContent"),
318        }
319    }
320
321    fn handle_end(&mut self, _c: char) -> Result {
322        match self.state {
323            State::End => Err("End of code"),
324            _ => panic!("Invalid State, expect End"),
325        }
326    }
327}
328
329#[cfg(test)]
330mod tests {
331    use super::*;
332
333    #[test]
334    fn test() {
335        let cases = vec![
336            (true, "<DIV>This is the first line <![CDATA[<div>]]></DIV>"),
337            (true, "<DIV>>>  ![cdata[]] <![CDATA[<div>]>]]>]]>>]</DIV>"),
338            (true, "<TRUE><![CDATA[wahaha]]]><![CDATA[]> wahaha]]></TRUE>"),
339
340            (false, "<DIV>>>  ![cdata[]] </![CDATA[<div>]>]]>]]>>]</DIV>"),
341            (false, "<A>  <B> </A>   </B>"),
342            (false, "<DIV>  div tag is not closed  <DIV>"),
343            (false, "<DIV>  unmatched <  </DIV>"),
344            (false, "<DIV> closed tags with invalid tag name  <b>123</b> </DIV>"),
345            (false, "<DIV> unmatched tags with invalid tag name  </1234567890> and <CDATA[[]]>  </DIV>"),
346            (false, "<DIV>  unmatched start tag <B>  and unmatched end tag </C>  </DIV>"),
347            (false, "<A><![CDATA[</A>]]123></A>"),
348        ];
349        for (expect, arg) in cases {
350            assert_eq!(expect, Solution::is_valid(arg.into()));
351        }
352    }
353}
leetcode/tag_validator.rs

leetcode/
tag_validator.rs