feat: support for chars
viddrobnic committed Jun 16, 2024
1 parent 97da460 commit 948e4aa
Showing 14 changed files with 250 additions and 21 deletions.
45 changes: 24 additions & 21 deletions examples/aoc_day_01.aoc
Original file line number Diff line number Diff line change
@@ -13,21 +13,23 @@ for (line = input(); line; line = input()) {
// Part one
res = 0
for (i = 0; i < len(data); i = i + 1) {
-    chars = split(data[i], "")
+    line = data[i]
    n = 0

    // First number
-    for (j = 0; j < len(chars); j = j + 1) {
-        if (int(chars[j])) {
-            n = int(chars[j])
+    for (j = 0; j < len(line); j = j + 1) {
+        c = int(line[j]) - int('0')
+        if (c >= 0 & c < 10) {
+            n = c
            break
        }
    }

    // Last number
-    for (j = len(chars) - 1; j >= 0; j = j - 1) {
-        if (int(chars[j])) {
-            n = n * 10 + int(chars[j])
+    for (j = len(line) - 1; j >= 0; j = j - 1) {
+        c = int(line[j]) - int('0')
+        if (c >= 0 & c < 10) {
+            n = n * 10 + c
            break
        }
    }
@@ -38,13 +40,13 @@ for (i = 0; i < len(data); i = i + 1) {
print("Part one: " + str(res))

// Part two
-substr_is = fn(target_ch, position, lookup_ch) {
-    if (position + len(lookup_ch) > len(target_ch)) {
+substr_is = fn(target, position, lookup) {
+    if (position + len(lookup) > len(target)) {
        return false
    }

-    for (i = 0; i < len(lookup_ch); i = i + 1) {
-        if (target_ch[position + i] != lookup_ch[i]) {
+    for (i = 0; i < len(lookup); i = i + 1) {
+        if (target[position + i] != lookup[i]) {
            return false
        }
    }
@@ -53,13 +55,14 @@ substr_is = fn(target_ch, position, lookup_ch) {
}

digits = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]
-digit = fn(chars, position) {
-    if (int(chars[position])) {
-        return int(chars[position])
+digit = fn(string, position) {
+    d = int(string[position]) - int('0')
+    if (d >= 0 & d < 10) {
+        return d
    }

    for (d = 0; d < len(digits); d = d + 1) {
-        if (substr_is(chars, position, split(digits[d], ""))) {
+        if (substr_is(string, position, digits[d])) {
            return d + 1
        }
    }
@@ -68,12 +71,12 @@ digit = fn(chars, position) {

res = 0
for (i = 0; i < len(data); i = i + 1) {
-    chars = split(data[i], "")
+    line = data[i]
    n = 0

    // First number
-    for (j = 0; j < len(chars); j = j + 1) {
-        d = digit(chars, j)
+    for (j = 0; j < len(line); j = j + 1) {
+        d = digit(line, j)
        if (d) {
            n = d
            break
@@ -82,8 +85,8 @@ for (i = 0; i < len(data); i = i + 1) {
    }

    // Last number
-    for (j = len(chars) - 1; j >= 0; j = j - 1) {
-        d = digit(chars, j)
+    for (j = len(line) - 1; j >= 0; j = j - 1) {
+        d = digit(line, j)
        if (d) {
            n = n * 10 + d
            break
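The change above drops split(data[i], "") in favor of indexing the line directly and doing char arithmetic with the new char literals. As a minimal sketch of that pattern (not part of the commit; is_digit is a hypothetical helper name), a digit check in AoC lang could look like this:

is_digit = fn(string, position) {
    d = int(string[position]) - int('0')
    if (d >= 0 & d < 10) {
        return true
    }
    return false
}

if (is_digit("a1b", 1)) {
    print("found a digit") // prints: found a digit
}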
45 changes: 45 additions & 0 deletions examples/strings.aoc
@@ -0,0 +1,45 @@
// AoC lang supports UTF-8, so we can do cool stuff like this:
string = "🚗"
print(string) // 🚗

// But it makes some things counterintuitive:
print(len(string)) // 4

// The behavior is similar to that of Rust strings, so you can read
// all about it here: https://doc.rust-lang.org/std/string/struct.String.html#utf-8

// TL;DR of the above article: strings are always UTF-8, which means that one
// grapheme can span multiple bytes. In AoC lang, a char is a single byte.
// This makes everything work nicely if you are using just ASCII strings,
// but you have to be careful when using other characters.

// Let's take a look at some examples. You can construct a single char as:
ch = 'A'
print(ch) // A

// And convert it to string with builtin str
string = str(ch)
print(string) // A

// Length returns the number of bytes, which, as we saw above, is not the same
// as the number of graphemes
print(len("AB")) // 2
print(len("🚗")) // 4

// We can use index notation to get a specific char from the string:
print("AB"[0]) // A

// If index is out of bounds, null is returned
print("AB"[5]) // null

// Since a char is a single byte, not a grapheme, weird stuff can happen:
print("🚗"[0]) // ð

// We can also convert chars to ints
print(int('A')) // 65

// And we can also go in the other direction
print(char(65)) // A

// If the int is 256 or larger, it just wraps around (mod 256)
print(char(1090)) // B // explanation: 1090 % 256 = 66, and 66 is 'B'
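As a small follow-up sketch (not part of the commit), the byte-oriented chars above can be used to scan a string byte by byte. This assumes int() maps any char to its byte value (0-255), as it does for 'A':

count_ascii = fn(string) {
    count = 0
    for (i = 0; i < len(string); i = i + 1) {
        // assumption: int() of a char returns its byte value
        if (int(string[i]) < 128) {
            count = count + 1
        }
    }
    return count
}

print(count_ascii("AB🚗")) // 2: the two ascii bytes; the four 🚗 bytes are all >= 128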
2 changes: 2 additions & 0 deletions parser/src/ast.rs
@@ -19,6 +19,7 @@ pub enum NodeValue {
Identifier(String),
IntegerLiteral(i64),
FloatLiteral(f64),
CharLiteral(u8),
BoolLiteral(bool),
StringLiteral(String),
ArrayLiteral(Vec<Node>),
@@ -227,6 +228,7 @@ impl Display for NodeValue {
NodeValue::Identifier(ident) => write!(f, "{ident}"),
NodeValue::IntegerLiteral(int) => write!(f, "{int}"),
NodeValue::FloatLiteral(float) => write!(f, "{float}"),
NodeValue::CharLiteral(ch) => write!(f, "{}", *ch as char),
NodeValue::BoolLiteral(boolean) => write!(f, "{boolean}"),
NodeValue::StringLiteral(string) => write!(f, "\"{string}\""),
NodeValue::ArrayLiteral(arr) => {
5 changes: 5 additions & 0 deletions parser/src/error.rs
@@ -10,6 +10,7 @@ pub enum ErrorKind {
UnexpectedEof,
InvalidEscapeChar(char),
InvalidChar(char),
NonAsciiChar(char),
InvalidExpression(TokenKind),
ExpectedEol,
InvalidNodeKind { expected: NodeKind, got: NodeKind },
@@ -40,6 +41,10 @@ impl Display for ErrorKind {
ErrorKind::UnexpectedEof => write!(f, "Unexpected end of file"),
ErrorKind::InvalidEscapeChar(ch) => write!(f, "Invalid escape character '{ch}'"),
ErrorKind::InvalidChar(ch) => write!(f, "Invalid character '{ch}'"),
ErrorKind::NonAsciiChar(ch) => write!(
f,
"Inalid character literal '{ch}'. Char literals only support ascii."
),
ErrorKind::InvalidExpression(token) => write!(f, "Not a valid expression: {token}"),
ErrorKind::ExpectedEol => write!(f, "Expression must end with new line"),
ErrorKind::InvalidNodeKind { expected, got } => {
72 changes: 72 additions & 0 deletions parser/src/lexer.rs
@@ -93,6 +93,48 @@ impl<'a> Lexer<'a> {
}
}

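// Read a char literal, where the opening `'` has already been consumed.
// Reads exactly one character (no escape sequences), expects a closing `'`,
// and rejects non-ascii characters with `ErrorKind::NonAsciiChar`.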
fn read_char(&mut self, start_position: Position) -> Result<TokenKind> {
let (_, ch) = self.chars.next().ok_or(Error {
kind: ErrorKind::UnexpectedEof,
range: Range {
start: start_position,
end: self.position,
},
})?;
self.position.character += ch.len_utf8();

let (_, end) = self.chars.next().ok_or(Error {
kind: ErrorKind::UnexpectedEof,
range: Range {
start: start_position,
end: self.position,
},
})?;
self.position.character += end.len_utf8();

if end != '\'' {
return Err(Error {
kind: ErrorKind::InvalidChar(end),
range: Range {
start: start_position,
end: self.position,
},
});
}

if !ch.is_ascii() {
return Err(Error {
kind: ErrorKind::NonAsciiChar(ch),
range: Range {
start: start_position,
end: self.position,
},
});
}

Ok(TokenKind::Char(ch as u8))
}

// Read ident or keyword, where the first char is at `self.input[start]`
// and `end` is start + utf8 len of first char
fn read_ident(&mut self, start: usize, mut end: usize) -> TokenKind {
@@ -249,6 +291,10 @@ impl Iterator for Lexer<'_> {
'>' => self.peek_parse('=', TokenKind::Geq, TokenKind::Ge),
'=' => self.peek_parse('=', TokenKind::Eq, TokenKind::Assign),
'!' => self.peek_parse('=', TokenKind::Neq, TokenKind::Bang),
'\'' => match self.read_char(start_position) {
Ok(token) => token,
Err(err) => return Some(Err(err)),
},
'"' => match self.read_string(start_position) {
Ok(token) => token,
Err(err) => return Some(Err(err)),
@@ -425,6 +471,7 @@ mod test {
"normal string" "\n\t\\\""
// line comment
false //inline comment
'A'
"#;

let lexer = Lexer::new(input);
@@ -666,6 +713,14 @@ mod test {
start: Position::new(9, 34),
end: Position::new(10, 0),
},
Range {
start: Position::new(10, 12),
end: Position::new(10, 15),
},
Range {
start: Position::new(10, 15),
end: Position::new(11, 0),
},
]
);

@@ -729,7 +784,24 @@ mod test {
TokenKind::False,
TokenKind::Comment("inline comment".to_string()),
TokenKind::Eol,
TokenKind::Char(b'A'),
TokenKind::Eol,
]
);
}

#[test]
fn non_ascii_char() {
let mut lexer = Lexer::new("'🚗'");
assert_eq!(
lexer.next(),
Some(Err(Error {
kind: ErrorKind::NonAsciiChar('🚗'),
range: Range {
start: Position::new(0, 0),
end: Position::new(0, 6),
}
}))
);
}
}
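The non_ascii_char test above pins down the user-facing behavior: a multi-byte character cannot appear in a char literal. A short illustrative AoC lang snippet (not part of the commit):

// A char literal must be a single ascii byte, so this line would fail to lex
// with ErrorKind::NonAsciiChar:
// ch = '🚗'
// Multi-byte symbols still work fine inside strings:
print("🚗") // 🚗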
1 change: 1 addition & 0 deletions parser/src/parser/mod.rs
@@ -182,6 +182,7 @@ impl Parser<'_> {
TokenKind::Ident(ident) => (ast::NodeValue::Identifier(ident), range.end),
TokenKind::Integer(int) => (ast::NodeValue::IntegerLiteral(int), range.end),
TokenKind::Float(flt) => (ast::NodeValue::FloatLiteral(flt), range.end),
TokenKind::Char(ch) => (ast::NodeValue::CharLiteral(ch), range.end),
TokenKind::True => (ast::NodeValue::BoolLiteral(true), range.end),
TokenKind::False => (ast::NodeValue::BoolLiteral(false), range.end),
TokenKind::String(string) => (ast::NodeValue::StringLiteral(string), range.end),
8 changes: 8 additions & 0 deletions parser/src/parser/test.rs
@@ -25,6 +25,7 @@ fn simple_prefix_expression() -> Result<()> {
"bar"
break
continue
'~'
"#;

let program = parse(input)?;
Expand Down Expand Up @@ -87,6 +88,13 @@ fn simple_prefix_expression() -> Result<()> {
end: Position::new(8, 16)
},
},
ast::Node {
value: ast::NodeValue::CharLiteral(b'~'),
range: Range {
start: Position::new(9, 8),
end: Position::new(9, 11),
}
},
]
);

2 changes: 2 additions & 0 deletions parser/src/token.rs
@@ -7,6 +7,7 @@ pub enum TokenKind {
Ident(String),
Integer(i64),
Float(f64),
Char(u8),
True,
False,
String(String),
@@ -104,6 +105,7 @@ impl Display for TokenKind {
TokenKind::Ident(_) => write!(f, "IDENT"),
TokenKind::Integer(_) => write!(f, "INTEGER"),
TokenKind::Float(_) => write!(f, "FLOAT"),
TokenKind::Char(_) => write!(f, "CHAR"),
TokenKind::True => write!(f, "TRUE"),
TokenKind::False => write!(f, "FALSE"),
TokenKind::String(_) => write!(f, "STRING"),