1use hyperon_atom::*;
4use hyperon_atom::gnd::*;
5
6use core::ops::Range;
7use regex::Regex;
8use std::rc::Rc;
9use unicode_reader::CodePoints;
10use std::io;
11use std::io::Read;
12
/// An ordered registry mapping regular expressions to atom constructors.
/// The parser uses it to turn raw token text into [Atom] values.
#[derive(Clone, Debug)]
pub struct Tokenizer {
    /// Descriptors in registration order; lookups scan from the back
    /// (see `find_token`), so later registrations override earlier ones.
    tokens: Vec<TokenDescr>,
}
17
/// A single token definition: a regex recognizing the token text plus the
/// constructor converting that text into an [Atom].
#[derive(Clone)]
struct TokenDescr {
    /// Pattern that must match the entire token text to apply.
    regex: Regex,
    /// Shared constructor; `Rc` keeps cloning the descriptor cheap.
    constr: Rc<AtomConstr>,
}
23
24impl std::fmt::Debug for TokenDescr {
25 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
26 write!(f, "TokenDescr{{ regex: {:?}, constr: {:?} }}", self.regex, Rc::as_ptr(&self.constr))
27 }
28}
29
/// Constructor turning matched token text into an [Atom], or an error message.
type AtomConstr = dyn Fn(&str) -> Result<Atom, String>;
31
32impl Tokenizer {
33
34 pub fn new() -> Self {
35 Self{ tokens: Vec::new() }
36 }
37
38 pub fn register_token<C: 'static + Fn(&str) -> Atom>(&mut self, regex: Regex, constr: C) {
39 self.register_token_with_func_ptr(regex, Rc::new(move |the_str| Ok(constr(the_str))))
40 }
41
42 pub fn register_fallible_token<C: 'static + Fn(&str) -> Result<Atom, String>>(&mut self, regex: Regex, constr: C) {
43 self.register_token_with_func_ptr(regex, Rc::new(constr))
44 }
45
46 pub fn register_token_with_regex_str<C: 'static + Fn(&str) -> Atom>(&mut self, regex: &str, constr: C) {
47 let regex = Regex::new(regex).unwrap();
48 self.register_token(regex, constr)
49 }
50
51 pub fn register_function<T: 'static + GroundedFunction>(&mut self, func: GroundedFunctionAtom<T>) {
52 let regex = Regex::new(func.name()).unwrap();
53 let constr = move |_token: &str| -> Atom { Atom::gnd(func.clone()) };
54 self.register_token(regex, constr)
55 }
56
57 pub fn move_front(&mut self, from: &mut Tokenizer) {
62 from.move_back(self);
63 self.move_back(from);
64 }
65
66 pub fn move_back(&mut self, from: &mut Tokenizer) {
71 self.tokens.append(&mut from.tokens);
72 }
73
74 pub fn find_token(&self, token: &str) -> Option<&AtomConstr> {
75 self.tokens.iter().rfind(|descr| {
76 match descr.regex.find_at(token, 0) {
77 Some(m) => m.start() == 0 && m.end() == token.len(),
78 None => false,
79 }
80 }).map(|descr| &*(descr.constr))
81 }
82
83 pub(crate) fn register_token_with_func_ptr(&mut self, regex: Regex, constr: Rc<AtomConstr>) {
85 self.tokens.push(TokenDescr{ regex, constr: constr })
86 }
87
88 pub(crate) fn find_exact(&self, regex_str: &str) -> Option<Rc<AtomConstr>> {
91 self.tokens.iter().rfind(|descr| {
92 descr.regex.as_str() == regex_str
93 }).map(|descr| descr.constr.clone())
94 }
95
96 pub fn remove_token(&mut self, regex_str: &str) {
97 if let Some(pos) = self.tokens.iter().position(|descr| descr.regex.as_str() == regex_str) {
98 self.tokens.remove(pos);
99 }
100 }
101
102}
103
/// Kinds of nodes in the concrete syntax tree produced by [SExprParser].
#[cfg_attr(test, derive(PartialEq))]
#[derive(Clone, Copy, Debug)]
pub enum SyntaxNodeType {
    /// A `;` comment running up to (not including) the end of the line.
    Comment,
    /// A `$`-prefixed variable token.
    VariableToken,
    /// A double-quoted string literal token.
    StringToken,
    /// Any other bare token (symbol text or tokenizer-matched text).
    WordToken,
    /// An opening `(`.
    OpenParen,
    /// A closing `)`.
    CloseParen,
    /// A single whitespace character.
    Whitespace,
    /// Text remaining in the input after a parse error.
    LeftoverText,
    /// A parenthesized expression together with all of its children.
    ExpressionGroup,
    /// A group of nodes wrapping a parse error.
    ErrorGroup,
}
132
133impl SyntaxNodeType {
134 pub fn is_leaf(&self) -> bool {
137 match self {
138 Self::ExpressionGroup |
139 Self::ErrorGroup => false,
140 _ => true
141 }
142 }
143}
144
/// A node of the concrete syntax tree produced by [SExprParser].
#[cfg_attr(test, derive(PartialEq))]
#[derive(Clone, Debug)]
pub struct SyntaxNode {
    /// The kind of this node.
    pub node_type: SyntaxNodeType,
    /// Byte range of the node's text within the source input.
    pub src_range: Range<usize>,
    /// Child nodes; empty for leaf nodes.
    pub sub_nodes: Vec<SyntaxNode>,
    /// Token text for leaf token nodes, `None` for group nodes.
    pub parsed_text: Option<String>,
    /// Error message when parsing this node failed.
    pub message: Option<String>,
    /// `false` when this node is the result of a parse error.
    pub is_complete: bool,
}
155
156impl SyntaxNode {
157 fn new(node_type: SyntaxNodeType, src_range: Range<usize>, sub_nodes: Vec<SyntaxNode>) -> SyntaxNode {
158 Self {
159 node_type,
160 src_range,
161 parsed_text: None,
162 sub_nodes,
163 message: None,
164 is_complete: true
165 }
166 }
167
168 fn new_token_node(node_type: SyntaxNodeType, src_range: Range<usize>, parsed_text: String) -> SyntaxNode {
169 let mut node = SyntaxNode::new(node_type, src_range, vec![]);
170 node.parsed_text = Some(parsed_text);
171 node
172 }
173
174 fn incomplete_with_message(node_type: SyntaxNodeType, src_range: Range<usize>, sub_nodes: Vec<SyntaxNode>, message: String) -> SyntaxNode {
175 let mut node = SyntaxNode::new(node_type, src_range, sub_nodes);
176 node.message = Some(message);
177 node.is_complete = false;
178 node
179 }
180
181 fn new_error_group(src_range: Range<usize>, sub_nodes: Vec<SyntaxNode>) -> SyntaxNode {
183 let message = sub_nodes[sub_nodes.len()-1].message.clone();
184 let mut node = SyntaxNode::new(SyntaxNodeType::ErrorGroup, src_range, sub_nodes);
185 node.message = message;
186 node.is_complete = false;
187 node
188 }
189
190 pub fn as_atom(&self, tokenizer: &Tokenizer) -> Result<Option<Atom>, String> {
192
193 if !self.is_complete {
195 return Err(self.message.clone().unwrap())
196 }
197
198 match self.node_type {
199 SyntaxNodeType::Comment |
200 SyntaxNodeType::Whitespace => Ok(None),
201 SyntaxNodeType::OpenParen |
202 SyntaxNodeType::CloseParen => Ok(None),
203 SyntaxNodeType::VariableToken => {
204 let token_text = self.parsed_text.as_ref().unwrap();
205 let new_var_atom = Atom::var(token_text);
206 Ok(Some(new_var_atom))
207 },
208 SyntaxNodeType::StringToken |
209 SyntaxNodeType::WordToken => {
210 let token_text = self.parsed_text.as_ref().unwrap();
211 let constr = tokenizer.find_token(token_text);
212 if let Some(constr) = constr {
213 let new_atom = constr(token_text)
214 .map_err(|e| format!("byte range = ({:?}) | {e}", self.src_range))?;
215 Ok(Some(new_atom))
216 } else {
217 let new_atom = Atom::sym(token_text);
218 Ok(Some(new_atom))
219 }
220 },
221 SyntaxNodeType::ExpressionGroup => {
222 let mut err_encountered = Ok(());
223 let expr_children: Vec<Atom> = self.sub_nodes.iter().filter_map(|node| {
224 match node.as_atom(tokenizer) {
225 Err(err) => {
226 err_encountered = Err(err);
227 None
228 },
229 Ok(atom) => atom
230 }
231 }).collect();
232 match err_encountered {
233 Ok(_) => {
234 let new_expr_atom = Atom::expr(expr_children);
235 Ok(Some(new_expr_atom))
236 },
237 Err(err) => Err(err)
238 }
239 },
240 SyntaxNodeType::LeftoverText |
241 SyntaxNodeType::ErrorGroup => {unreachable!()}
242 }
243 }
244
245 pub fn visit_depth_first<C>(&self, mut callback: C)
247 where C: FnMut(&SyntaxNode)
248 {
249 self.visit_depth_first_internal(&mut callback);
250 }
251
252 fn visit_depth_first_internal<C>(&self, callback: &mut C)
253 where C: FnMut(&SyntaxNode)
254 {
255 for sub_node in self.sub_nodes.iter() {
256 sub_node.visit_depth_first_internal(callback);
257 }
258 callback(self);
259 }
260}
261
/// A source of atoms: a text parser or any other producer yielding atoms
/// one at a time.
pub trait Parser {
    /// Returns the next atom, `Ok(None)` at the end of input, or an error
    /// message on failure.
    fn next_atom(&mut self, tokenizer: &Tokenizer) -> Result<Option<Atom>, String>;
}
267
impl<R: Iterator<Item=io::Result<char>>> Parser for SExprParser<R> {
    // Delegates to the S-expression parser itself.
    fn next_atom(&mut self, tokenizer: &Tokenizer) -> Result<Option<Atom>, String> {
        self.parse(tokenizer)
    }
}
273
impl Parser for &mut (dyn Parser + '_) {
    // Forwards to the referenced parser so `&mut dyn Parser` can be used
    // wherever an `impl Parser` is expected.
    fn next_atom(&mut self, tokenizer: &Tokenizer) -> Result<Option<Atom>, String> {
        (**self).next_atom(tokenizer)
    }
}
279
/// One item read from the input: the byte offset paired with the read result.
type CharReaderItem = (usize, io::Result<char>);

/// A one-character lookahead wrapper over a fallible char iterator which
/// tracks the byte offset of each character.
pub struct CharReader<R: Iterator<Item=io::Result<char>>> {
    chars: R,
    /// Byte offset of the next not-yet-consumed character.
    idx: usize,
    /// Buffered next item plus the UTF-8 byte width to advance by when it is
    /// consumed (0 for errors and end of input).
    next: (Option<CharReaderItem>, usize),
}
290
impl<R: Iterator<Item=io::Result<char>>> CharReader<R> {
    /// Wraps `chars`, pre-reading the first item into the lookahead buffer.
    pub fn new(chars: R) -> Self {
        let mut s = Self{ chars, idx: 0, next: (None, 0) };
        s.next = s.next_internal();
        s
    }

    /// Byte offset of the next character `next` would return.
    pub fn last_idx(&self) -> usize {
        self.idx
    }

    /// Returns the buffered next item without consuming it.
    pub fn peek(&mut self) -> Option<&CharReaderItem> {
        self.next.0.as_ref()
    }

    /// Consumes and returns the buffered item, advancing the byte offset by
    /// the consumed character's UTF-8 width and refilling the buffer.
    pub fn next(&mut self) -> Option<CharReaderItem> {
        // Order matters: advance `idx` by the width of the item being
        // consumed *before* reading the replacement, which records `idx`.
        self.idx += self.next.1;
        let next = self.next_internal();
        std::mem::replace(&mut self.next, next).0
    }

    /// Reads one item from the underlying iterator, pairing it with the
    /// current offset and the byte width to advance by upon consumption.
    pub fn next_internal(&mut self) -> (Option<CharReaderItem>, usize) {
        let c = self.chars.next();
        match c {
            Some(r @ Ok(c)) => (Some((self.idx, r)), c.len_utf8()),
            Some(Err(e)) => (Some((self.idx, Err(e))), 0),
            None => (None, 0),
        }
    }
}
322
impl<R: Iterator<Item=io::Result<char>>> Iterator for CharReader<R> {
    type Item = io::Result<char>;

    fn next(&mut self) -> Option<Self::Item> {
        // Resolves to the inherent `CharReader::next` (inherent methods take
        // precedence over trait methods), dropping the byte offset.
        self.next().map(|(_, c)| c)
    }
}
330
// Builds a CharReader from any byte reader by decoding UTF-8 code points.
impl<R: Read> From<R> for CharReader<CodePoints<std::io::Bytes<R>>> {
    fn from(input: R) -> Self {
        let chars: CodePoints<_> = input.bytes().into();
        Self::new(chars)
    }
}
337
// Builds a CharReader from a fallible byte iterator via UTF-8 decoding.
impl<R: Iterator<Item=io::Result<u8>>> From<R> for CharReader<CodePoints<R>> {
    fn from(input: R) -> Self {
        let chars: CodePoints<_> = input.into();
        Self::new(chars)
    }
}
344
// Builds a CharReader borrowing the characters of a string slice.
impl<'a> From<&'a str> for CharReader<std::iter::Map<std::str::Chars<'a>, fn(char) -> io::Result<char>>> {
    fn from(text: &'a str) -> Self {
        Self::new(text.chars().map(Ok))
    }
}
350
// Builds a CharReader owning the input text. The chars are collected into a
// Vec first because `str::Chars` borrows the String, which is consumed here.
impl From<String> for CharReader<std::iter::Map<std::vec::IntoIter<char>, fn(char) -> io::Result<char>>> {
    fn from(text: String) -> Self {
        let chars: Vec<char> = text.chars().collect();
        Self::new(chars.into_iter().map(Ok))
    }
}
357
// Builds a CharReader borrowing the characters of a String reference.
impl<'a> From<&'a String> for CharReader<std::iter::Map<std::str::Chars<'a>, fn(char) -> io::Result<char>>> {
    fn from(text: &'a String) -> Self {
        Self::new(text.chars().map(Ok))
    }
}
363
/// S-expression parser producing [SyntaxNode] trees and [Atom]s from a
/// character stream.
pub struct SExprParser<R: Iterator<Item=io::Result<char>>> {
    /// Lookahead reader over the input characters.
    it: CharReader<R>,
}
371
372impl<R: Iterator<Item=io::Result<char>>> SExprParser<R> {
373
374 pub fn new<I: Into<CharReader<R>>>(chars: I) -> Self {
375 Self{ it: chars.into() }
376 }
377
378 pub fn parse(&mut self, tokenizer: &Tokenizer) -> Result<Option<Atom>, String> {
379 loop {
380 match self.parse_to_syntax_tree()? {
381 Some(node) => {
382 if let Some(atom) = node.as_atom(tokenizer)? {
383 return Ok(Some(atom))
384 }
385 },
386 None => {
387 return Ok(None);
388 },
389 }
390 }
391 }
392
393 fn peek(&mut self) -> Result<Option<(usize, char)>, String> {
394 match self.it.peek() {
395 Some(&(idx, Ok(c))) => Ok(Some((idx, c))),
396 None => Ok(None),
397 Some((idx, Err(err))) => Err(format!("Input read error at position {}: {}", idx, err)),
398 }
399 }
400
401 fn next(&mut self) -> Result<Option<(usize, char)>, String> {
402 match self.it.next() {
403 Some((idx, Ok(c))) => Ok(Some((idx, c))),
404 None => Ok(None),
405 Some((idx, Err(err))) => Err(format!("Input read error at position {}: {}", idx, err)),
406 }
407 }
408
409 fn skip_next(&mut self) {
410 self.it.next();
411 }
412
413 pub fn parse_to_syntax_tree(&mut self) -> Result<Option<SyntaxNode>, String> {
414 if let Some((idx, c)) = self.peek()? {
415 match c {
416 ';' => {
417 return self.parse_comment();
418 },
419 _ if c.is_whitespace() => {
420 let whispace_node = SyntaxNode::new(SyntaxNodeType::Whitespace, idx..idx+1, vec![]);
421 self.skip_next();
422 return Ok(Some(whispace_node));
423 },
424 '$' => {
425 return self.parse_variable().map(Some);
426 },
427 '(' => {
428 return self.parse_expr().map(Some);
429 },
430 ')' => {
431 let close_paren_node = SyntaxNode::new(SyntaxNodeType::CloseParen, idx..idx+1, vec![]);
432 self.skip_next();
433 let leftover_text_node = self.parse_leftovers(idx + 1, "Unexpected right bracket".to_string())?;
434 let error_group_node = SyntaxNode::new_error_group(idx..self.cur_idx(), vec![close_paren_node, leftover_text_node]);
435 return Ok(Some(error_group_node));
436 },
437 _ => {
438 return self.parse_token();
439 },
440 };
441 }
442 Ok(None)
443 }
444
445 fn cur_idx(&mut self) -> usize {
447 self.it.last_idx()
448 }
449
450 fn parse_comment(&mut self) -> Result<Option<SyntaxNode>, String> {
452 if let Some((start_idx, _c)) = self.peek()? {
453 while let Some((_idx, c)) = self.peek()? {
454 match c {
455 '\n' => break,
456 _ => self.skip_next(),
457 }
458 }
459 let range = start_idx..self.cur_idx();
460 Ok(Some(SyntaxNode::new(SyntaxNodeType::Comment, range, vec![])))
461 } else {
462 Ok(None)
463 }
464 }
465
466 fn parse_leftovers(&mut self, start_idx: usize, message: String) -> Result<SyntaxNode, String> {
467 while let Some(_) = self.next()? {}
468 let range = start_idx..self.cur_idx();
469 Ok(SyntaxNode::incomplete_with_message(SyntaxNodeType::LeftoverText, range, vec![], message))
470 }
471
472 fn parse_expr(&mut self) -> Result<SyntaxNode, String> {
473 let start_idx = self.cur_idx();
474 let mut child_nodes: Vec<SyntaxNode> = Vec::new();
475
476 let open_paren_node = SyntaxNode::new(SyntaxNodeType::OpenParen, start_idx..start_idx+1, vec![]);
477 child_nodes.push(open_paren_node);
478 self.skip_next();
479
480 while let Some((idx, c)) = self.peek()? {
481 match c {
482 ';' => {
483 let comment_node = self.parse_comment()?.unwrap();
484 child_nodes.push(comment_node);
485 },
486 _ if c.is_whitespace() => {
487 let whitespace_node = SyntaxNode::new(SyntaxNodeType::Whitespace, idx..idx+1, vec![]);
488 child_nodes.push(whitespace_node);
489 self.skip_next();
490 },
491 ')' => {
492 let close_paren_node = SyntaxNode::new(SyntaxNodeType::CloseParen, idx..idx+1, vec![]);
493 child_nodes.push(close_paren_node);
494 self.skip_next();
495
496 let expr_node = SyntaxNode::new(SyntaxNodeType::ExpressionGroup, start_idx..self.cur_idx(), child_nodes);
497 return Ok(expr_node);
498 },
499 _ => {
500 if let Some(parsed_node) = self.parse_to_syntax_tree()? {
501 let is_err = !parsed_node.is_complete;
502 child_nodes.push(parsed_node);
503
504 if is_err {
506 let error_group_node = SyntaxNode::new_error_group(start_idx..self.cur_idx(), child_nodes);
507 return Ok(error_group_node);
508 }
509 } else {
510 let leftover_node = SyntaxNode::incomplete_with_message(SyntaxNodeType::ErrorGroup, start_idx..self.cur_idx(), child_nodes, "Unexpected end of expression member".to_string());
511 return Ok(leftover_node);
512 }
513 },
514 }
515 }
516 let leftover_node = SyntaxNode::incomplete_with_message(SyntaxNodeType::ErrorGroup, start_idx..self.cur_idx(), child_nodes, "Unexpected end of expression".to_string());
517 Ok(leftover_node)
518 }
519
520 fn parse_token(&mut self) -> Result<Option<SyntaxNode>, String> {
521 match self.peek()? {
522 Some((_idx, '"')) => {
523 let string_node = self.parse_string()?;
524 Ok(Some(string_node))
525 },
526 Some((_idx, _)) => {
527 let word_node = self.parse_word()?;
528 Ok(Some(word_node))
529 },
530 None => Ok(None)
531 }
532 }
533
534 fn parse_string(&mut self) -> Result<SyntaxNode, String> {
535 let mut token = String::new();
536 let start_idx = self.cur_idx();
537
538 if let Some((_idx, '"')) = self.next()? {
539 token.push('"');
540 } else {
541 let leftover_text_node = SyntaxNode::incomplete_with_message(SyntaxNodeType::LeftoverText, start_idx..self.cur_idx(), vec![], "Double quote expected".to_string());
542 return Ok(leftover_text_node);
543 }
544 while let Some((char_idx, c)) = self.next()? {
545 if c == '"' {
546 token.push('"');
547 let string_node = SyntaxNode::new_token_node(SyntaxNodeType::StringToken, start_idx..self.cur_idx(), token);
548 return Ok(string_node);
549 }
550 if c == '\\' {
551 let escape_err = |cur_idx| { SyntaxNode::incomplete_with_message(SyntaxNodeType::StringToken, char_idx..cur_idx, vec![], "Invalid escape sequence".to_string()) };
552
553 match self.next()? {
554 Some((_idx, c)) => {
555 let val = match c {
556 '\'' | '\"' | '\\' => c, 'n' => '\n', 'r' => '\r', 't' => '\t', 'x' => { match self.parse_2_digit_radix_char(16)? {
562 Some(c) => c,
563 None => {return Ok(escape_err(self.cur_idx())); }
564 }
565 },
566 'u' => { match self.parse_unicode_sequence()? {
568 Some(c) => c,
569 None => { return Ok(escape_err(self.cur_idx()));
570 }
571 }}
572 _ => {
573 return Ok(escape_err(self.cur_idx()));
574 }
575 };
576 token.push(val);
577 },
578 None => {
579 let leftover_text_node = SyntaxNode::incomplete_with_message(SyntaxNodeType::StringToken, start_idx..self.cur_idx(), vec![], "Escaping sequence is not finished".to_string());
580 return Ok(leftover_text_node);
581 },
582 }
583 } else {
584 token.push(c);
585 }
586 }
587 let unclosed_string_node = SyntaxNode::incomplete_with_message(SyntaxNodeType::StringToken, start_idx..self.cur_idx(), vec![], "Unclosed String Literal".to_string());
588 Ok(unclosed_string_node)
589 }
590
591 fn parse_2_digit_radix_char(&mut self, radix: u32) -> Result<Option<char>, String> {
593 let high = self.next()?.and_then(|(_, c)| c.to_digit(radix));
594 let high = match high {
595 Some(high) => high << 4,
596 None => return Ok(None),
597 };
598 let low = self.next()?.and_then(|(_, c)| c.to_digit(radix));
599 let low = match low {
600 Some(low) => low,
601 None => return Ok(None),
602 };
603 let hex = high | low;
604 if hex <= 0x7F {
605 Ok(char::from_u32(hex))
606 } else {
607 Ok(None)
608 }
609 }
610
611 fn parse_unicode_sequence(&mut self) -> Result<Option<char>, String> {
612 let mut result_u32: u32 = 0;
614 let mut cur_idx = 0;
615 match self.next()? {
616 Some((_, '{')) => (),
617 _ => return Ok(None),
618 };
619 loop
620 {
621 let next_char = match self.next()? {
622 Some(char_val) => char_val.1,
623 None => return Ok(None),
624 };
625 if next_char == '{' {continue}
626 if next_char == '}' {break}
627 cur_idx += 1;
628 if cur_idx > 8 { return Ok(None) }
629 let char_to_digit = match next_char.to_digit(16) {
630 Some(digit) => digit,
631 None => return Ok(None),
632 };
633 result_u32 = (result_u32 << 4) | char_to_digit;
634 }
635 Ok(char::from_u32(result_u32))
636 }
637
638 fn parse_word(&mut self) -> Result<SyntaxNode, String> {
639 let mut token = String::new();
640 let start_idx = self.cur_idx();
641
642 while let Some((_idx, c)) = self.peek()? {
643 if c.is_whitespace() || c == '(' || c == ')' || c == ';' {
644 break;
645 }
646 token.push(c);
647 self.skip_next();
648 }
649
650 let word_node = SyntaxNode::new_token_node(SyntaxNodeType::WordToken, start_idx..self.cur_idx(), token);
651 Ok(word_node)
652 }
653
654 fn parse_variable(&mut self) -> Result<SyntaxNode, String> {
655 let (start_idx, _c) = self.peek()?.unwrap();
656 self.skip_next();
657
658 let mut token = String::new();
659 while let Some((_idx, c)) = self.peek()? {
660 if c.is_whitespace() || c == '(' || c == ')' {
661 break;
662 }
663 if c == '#' {
664 let leftover_node = self.parse_leftovers(start_idx, "'#' char is reserved for internal usage".to_string());
665 return leftover_node;
666 }
667 token.push(c);
668 self.skip_next();
669 }
670 let var_token_node = SyntaxNode::new_token_node(SyntaxNodeType::VariableToken, start_idx..self.cur_idx(), token);
671 Ok(var_token_node)
672 }
673
674}
675
676impl Parser for &[Atom] {
677 fn next_atom(&mut self, _tokenizer: &Tokenizer) -> Result<Option<Atom>, String> {
678 if let Some((atom, rest)) = self.split_first() {
679 *self = rest;
680 Ok(Some(atom.clone()))
681 } else {
682 Ok(None)
683 }
684 }
685}
686
/// Test helper: parses `atom_str` with an empty tokenizer and returns the
/// single atom it contains, panicking on parse errors or empty input.
#[cfg(test)]
pub(crate) fn metta_atom(atom_str: &str) -> Atom {
    SExprParser::new(atom_str)
        .parse(&Tokenizer::new())
        .unwrap()
        .expect("Single atom is expected")
}
693
// Unit tests covering tokenization, escapes, comments, error reporting and
// the various input sources accepted by the parser.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_var() {
        assert_eq!(vec![expr!(n)], parse_atoms("$n"));
    }

    #[test]
    fn test_text_sym() {
        assert_eq!(vec![expr!("test")], parse_atoms("test"));
    }

    #[test]
    fn test_text_quoted_string() {
        assert_eq!(vec![expr!("\"te st\"")], parse_atoms("\"te st\""));
    }

    #[test]
    fn test_text_escape_chars() {
        assert_eq!(vec![expr!("\"test\ttab\"")], parse_atoms(r#""test\ttab""#));
        assert_eq!(vec![expr!("\"test\nnewline\"")], parse_atoms(r#""test\nnewline""#));
        assert_eq!(vec![expr!("\"\x1b[1;32m> \x1b[0m\"")], parse_atoms(r#""\x1b[1;32m> \x1b[0m""#));
        assert_eq!(vec![expr!("\"test\"quote\"")], parse_atoms(r#""test\"quote""#));
        assert_eq!(vec![expr!("\"test\x7Fmax\"")], parse_atoms(r#""test\x7fmax""#));
        // \x escapes are limited to ASCII (<= 0x7F), so \xFF must fail.
        assert!(parse_atoms(r#""test\xFF""#).len() == 0);
    }

    #[test]
    fn test_text_recognize_full_token() {
        // The token regex must match the whole word; "b" inside "ab" is not
        // a match, so "ab" stays a plain symbol.
        let mut tokenizer = Tokenizer::new();
        tokenizer.register_token(Regex::new(r"b").unwrap(),
            |_| Atom::value("b"));

        let mut parser = SExprParser::new("ab");

        assert_eq!(Ok(Some(expr!("ab"))), parser.parse(&tokenizer));
        assert_eq!(Ok(None), parser.parse(&tokenizer));
    }

    #[test]
    fn test_text_gnd() {
        let mut tokenizer = Tokenizer::new();
        tokenizer.register_token(Regex::new(r"\d+").unwrap(),
            |token| Atom::value(token.parse::<i32>().unwrap()));

        let mut parser = SExprParser::new("(3d 42)");

        assert_eq!(Ok(Some(expr!("3d" {42}))), parser.parse(&tokenizer));
        assert_eq!(Ok(None), parser.parse(&tokenizer));
    }

    #[test]
    fn test_text_expr() {
        assert_eq!(vec![expr!("=" ("fac" n) ("*" n ("fac" ("-" n "1"))))],
            parse_atoms("(= (fac $n) (* $n (fac (- $n 1))))"));
    }

    #[test]
    fn test_text_few_expr() {
        assert_eq!(vec![expr!(("a")), expr!(("b"))],
            parse_atoms("(a) (b)"));
    }

    #[test]
    fn test_next_token() {
        let text = "n)";
        let mut parser = SExprParser::new(text);

        let node = parser.parse_token().unwrap().unwrap();
        assert_eq!("n".to_string(), text[node.src_range]);
        assert_eq!(Ok(Some((1, ')'))), parser.next());

        let text = "n;Some comments.";
        let mut parser = SExprParser::new(text);

        let node = parser.parse_token().unwrap().unwrap();
        assert_eq!("n".to_string(), text[node.src_range]);
        assert_eq!(Ok(Some((1, ';'))), parser.next());
    }

    #[test]
    fn test_next_string_errors() {
        let mut parser = SExprParser::new("a");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Double quote expected", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Escaping sequence is not finished", node.message.unwrap());
    }

    #[test]
    fn test_non_ascii() {
        // src_range is in bytes, so multi-byte chars advance it by their
        // UTF-8 width.
        let text = "ж)";
        let mut parser = SExprParser::new(text);
        let node = parser.parse_token().unwrap().unwrap();
        assert_eq!("ж".to_string(), text[node.src_range]);
        assert_eq!(Ok(Some((2, ')'))), parser.next());

        let text = "⟮;Some comments.";
        let mut parser = SExprParser::new(text);
        let node = parser.parse_token().unwrap().unwrap();
        assert_eq!("⟮".to_string(), text[node.src_range]);
        assert_eq!(Ok(Some((3, ';'))), parser.next());

        let text = "⟮";
        let mut parser = SExprParser::new(text);
        let node = parser.parse_token().unwrap().unwrap();
        assert_eq!("⟮".to_string(), text[node.src_range]);
        assert_eq!(Ok(None), parser.next());
    }

    #[test]
    fn test_parse_unicode() {
        let mut parser = SExprParser::new("\"\\u{0123}\"");
        assert_eq!(Ok(SyntaxNode::new_token_node(SyntaxNodeType::StringToken, 0..10, "\"\u{0123}\"".into())), parser.parse_string());

        let mut parser = SExprParser::new("\"\\u{0123\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\u0123}\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\u0123\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\u0{123}\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\u{defg}\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

        let mut parser = SExprParser::new("\"\\u{130AC}\"");
        assert_eq!(Ok(SyntaxNode::new_token_node(SyntaxNodeType::StringToken, 0..11, "\"\u{130AC}\"".into())), parser.parse_string());

        let mut parser = SExprParser::new("\"\\u{130ac}\"");
        assert_eq!(Ok(SyntaxNode::new_token_node(SyntaxNodeType::StringToken, 0..11, "\"\u{130ac}\"".into())), parser.parse_string());

        let mut parser = SExprParser::new("\"\\u{012345678\"");
        let node = parser.parse_string().unwrap();
        assert!(!node.is_complete);
        assert_eq!("Invalid escape sequence", node.message.unwrap());

    }

    #[test]
    fn test_unbalanced_brackets() {
        let mut parser = SExprParser::new("(a)))");
        let _ = parser.parse(&Tokenizer::new());
        assert_eq!(Err(String::from("Unexpected right bracket")), parser.parse(&Tokenizer::new()));
    }

    #[test]
    fn test_error_from_tokenizer() {
        let mut tokenizer = Tokenizer::new();
        tokenizer.register_fallible_token(Regex::new(r"[\-\+]?\d+.\d+").unwrap(),
            |token| Ok(Atom::gnd(hyperon_atom::gnd::number::Number::from_float_str(token)?))
        );
        let mut parser = SExprParser::new("12345678901234567:8901234567890");
        assert!(parser.parse(&tokenizer).is_err());
    }

    #[test]
    fn test_comment_base() {
        let program = ";(a 4)
                  (b 5)";
        let expected = vec![expr!("b" "5")];
        let res = parse_atoms(program);
        assert_eq!(res, expected);
    }

    #[test]
    fn test_comment_in_sexpr() {
        let program = " (a ; 4)
                  5)";
        let expected = vec![expr!("a" "5")];
        let res = parse_atoms(program);
        assert_eq!(res, expected);
    }

    #[test]
    fn test_comment_in_sexpr_before_closing_bracket() {
        let program = " (a 5 ; 4)
                  )";
        let expected = vec![expr!("a" "5")];
        let res = parse_atoms(program);
        assert_eq!(res, expected);
    }

    #[test]
    fn test_comment_endl() {
        let program = " (a 4);
                  (b 5)";
        let expected = vec![expr!("a" "4"), expr!("b" "5")];
        let res = parse_atoms(program);
        assert_eq!(res, expected);
    }

    // Helper: parses the whole program with an empty tokenizer, stopping at
    // the first error or at the end of the input.
    fn parse_atoms(program: &str) -> Vec<Atom> {
        let tokenizer = Tokenizer::new();
        let mut parser = SExprParser::new(program);
        let mut result = Vec::new();
        while let Ok(Some(atom)) = parser.parse(&tokenizer) {
            result.push(atom);
        }
        result
    }

    #[test]
    fn test_lattice_in_var_name() {
        let mut parser = SExprParser::new("$a#");
        assert_eq!(Err(String::from("'#' char is reserved for internal usage")), parser.parse(&Tokenizer::new()));
    }

    #[test]
    fn override_token_definition() {
        let mut tokenizer = Tokenizer::new();
        tokenizer.register_token(Regex::new(r"A").unwrap(), |_| Atom::sym("A"));
        assert_eq!(tokenizer.find_token("A").unwrap()("A").unwrap(), Atom::sym("A"));
        tokenizer.register_token(Regex::new(r"A").unwrap(), |_| Atom::sym("B"));
        assert_eq!(tokenizer.find_token("A").unwrap()("A").unwrap(), Atom::sym("B"));
    }

    #[test]
    fn test_owned_sexprparser() {
        let tokenizer = Tokenizer::new();
        let mut parser;
        {
            // The parser must outlive the String it was created from.
            let owned = r#"One (two 3) "four""#.to_owned();
            parser = SExprParser::new(owned);
        }
        let mut results: Vec<Atom> = vec![];
        while let Ok(Some(atom)) = parser.next_atom(&tokenizer) {
            results.push(atom);
        }
        let expected = vec![sym!("One"), expr!("two" "3"), sym!(r#""four""#)];
        assert_eq!(results, expected);
    }

    #[test]
    fn test_comment_in_symbol() {
        let program = "(a; 4)
                  5)";
        let expected = vec![expr!("a" "5")];
        let res = parse_atoms(program);
        assert_eq!(res, expected);
    }
}