Hedron Hedron - 3 months ago 6
C Question

Strange numbers in lexer

I'm wrapping up my lexer and I'm having trouble printing and or correctly assigning numbers to their value. Here is my output, it should be

integer: "10"
, I believe the issue is somewhere around
lexer_num
or
lexer_flt
:
(Including more information as requested.) I don't have or really know what a hex dump is so I don't think I can provide that. To answer your question about why
v
in
lexer_num
is a size_t is because their isn't a need for negative numbers since negative are handled in the parser not the lexer.

Type: "40" {
Line: "1"
Pos: "0"
Num: "2591542"
Real: "0.000000"
Stri: ""
}


code:

#define _CRT_SECURE_NO_WARNINGS
#define DEBUG 0

#include "lexer.h"
#include "error.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <ctype.h>
#include <assert.h>

typedef struct lextoken_t {
const char* str;
token_t type;
} lextoken_t;

static const lextoken_t keywords[] = {
// types
{ "int", _int },
{ "double", _dbl },
{ "void", _void },
{ "char", _char },
{ "string", _str },
{ "bool", _bool },
{ "const", _const },
{ "struct", _struct }
};

/* token_new: creates and returns a new token ptr.
** -lexer: a ptr to the lexer.
** -type: the token type.
*/
token_t* token_new(lexer_t* lexer, tk_type type) {
token_t* token = malloc(sizeof(token_t));
token->line = lexer->line;
token->pos = lexer->pos;
token->type = type;
token->integer = 0;
token->flt = 0;
token->string = NULL;

return token;
}

static void token_print(token_t* token) {
if (token == NULL)
printf("Null token");

printf("Type: \"%i\" { \n", token->type);
printf("\tLine: \"%i\"\n", token->line);
printf("\tPos: \"%i\"\n", token->pos);
printf("\tNum: \"%i\"\n", token->integer);
printf("\tReal: \"%f\"\n", token->flt);
printf("\tStri: \"%s\"\n}\n\n", token->string);
}

/* lexer_look: look at the source (ahead) places infront of the lexer->ptr.
** -lexer: a ptr to the lexer to look ahead in.
** -ahead: how far ahead of the ptr to look.
*/
static char lexer_look(lexer_t* lexer, size_t ahead) {
if (lexer->len < lexer->ptr + ahead) {
error_new(lexer->errors, 0, 0, "The lexer tried to index %d out of bounds %d", lexer->ptr + ahead, lexer->len);
return;
}
return lexer->src[lexer->ptr + ahead];
}

static size_t can_adv(lexer_t* lexer, size_t steps) {
if (lexer->ptr + steps <= lexer->len)
return 1;
else
return 0;
}

/* lexer_adv: moves the lexer->ptr (steps) places.
** -lexer: a ptr to the lexer to look ahead in.
** -steps: how far to advance the ptr.
*/
static char lexer_adv(lexer_t* lexer, size_t steps) {

if (!can_adv(lexer, steps))
error_new(lexer->errors, 0, 0, "The lexer tried to move ptr past bounds %d with value of %d", lexer->len, lexer->ptr + steps);
lexer->ptr += steps;
return lexer->src[lexer->ptr];
}

static void new_line(lexer_t* lexer) {
lexer->line = 0;
lexer->pos = 0;
}

static void lexer_nested(lexer_t* lexer) {
lexer_adv(lexer, 2);
char c = lexer_look(lexer, 0);
size_t depth = 1;

while (depth > 0) {
if (!can_adv(lexer, 1))
error_new(lexer->errors, lexer->line, lexer->pos, "Unterminated block comment.");
else if (c == '*' && lexer_look(lexer, 1) == '#') {
lexer_adv(lexer, 2);
depth--;
} else if (c == '#' && lexer_look(lexer, 1) == '*') {
lexer_adv(lexer, 2);
depth++;
} else
c = lexer_adv(lexer, 1);
}
}

static void lexer_comment(lexer_t* lexer) {
if (lexer_look(lexer, 1) == '*')
lexer_nested(lexer);
else {
char c;
while (((c = lexer_look(lexer, 0)) != '\n') && can_adv(lexer, 1))
lexer_adv(lexer, 1);
new_line(lexer);
}
}

static token_t* lexer_str(lexer_t* lexer) {
size_t str_len = 0;

while (true) {
if (!can_adv(lexer, 1)) {
error_new(lexer->errors, lexer->len, lexer->pos, "Unterminated string.");
return NULL;
}
else if (lexer_look(lexer, 1) == '\"') {
lexer_adv(lexer, 2);
break;
}
else {
lexer_adv(lexer, 1);
str_len++;
}
}

char* string = malloc(str_len + 1);
for (size_t idx = 0; idx < str_len; idx++)
string[idx] = lexer->src[lexer->ptr - str_len + idx];
string[str_len] = '\0';

token_t* token = token_new(lexer, _str);
token->string = string;
return token;
}

static token_t* lexer_float(lexer_t* lexer, token_t* token, size_t v) {
size_t places = 0;
double d = v;

if (!isdigit(lexer_look(lexer, 1))) { return token; }

while (lexer->len > 0) {
char c = lexer_look(lexer, 1);

if (isdigit(c)) {
lexer_adv(lexer, 1);
d = (d * 10) + (c - '0');
places++;
} else
break;
}

token->flt = d / (places * 10);
token->string = "";
return token;
}

static token_t* lexer_num(lexer_t* lexer) {
token_t* token = token_new(lexer, _int);
size_t v = 0;

while (can_adv(lexer, 1)) {
char c = lexer_look(lexer, 0);

if (isdigit(c)) {
v = (v * 10) + (c - '0');
lexer_adv(lexer, 1);
} else if (c == '.') {
lexer_adv(lexer, 1);
return lexer_float(lexer, token, v);
} else {
break;
}
}

token->integer = v;
token->string = "";
return token;
}

static token_t* lexer_ident(lexer_t* lexer) {
token_t* token = token_new(lexer, _ident);
size_t id_len = 0;

while (can_adv(lexer, 1)) {
if (!isalpha(lexer_look(lexer, 0)))
break;

lexer_adv(lexer, 1);
id_len++;
}

char* ident = malloc(id_len + 1);
for (size_t idx = 0; idx < id_len; idx++)
ident[idx] = lexer->src[lexer->ptr - id_len + idx];
ident[id_len] = '\0';

token->string = ident;
return token;
}

static token_t* next_token(lexer_t* lexer) {
token_t* token = NULL;

while (token == NULL && can_adv(lexer, 1)) {
const int c = lexer_look(lexer, 0);

if (DEBUG)
printf("Current character: \"%c\", Length: %d, Pointer: %d \n", lexer_look(lexer, 0), lexer->len, lexer->ptr);

switch (c) {
case '=':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _eqto);
lexer_adv(lexer, 2);
token->string = "==";
} else {
token = token_new(lexer, _assign);
token->string = "=";
lexer_adv(lexer, 1);
}

break;
case '+':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _addeql);
lexer_adv(lexer, 2);
token->string = "+=";
} else {
token = token_new(lexer, _add);
token->string = "+";
lexer_adv(lexer, 1);
}

break;
case '-':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _subeql);
lexer_adv(lexer, 2);
token->string = "-=";
} else {
token = token_new(lexer, _sub);
token->string = "-";
lexer_adv(lexer, 1);
}

break;
case '*':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _muleql);
lexer_adv(lexer, 2);
token->string = "*=";
} else {
token = token_new(lexer, _mul);
token->string = "*";
lexer_adv(lexer, 1);
}

break;
case '/':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _diveql);
lexer_adv(lexer, 2);
token->string = "/=";
} else {
token = token_new(lexer, _div);
token->string = "/";
lexer_adv(lexer, 1);
}

break;
case '<':
if (lexer_look(lexer, 1) == '<') {
token = token_new(lexer, _nteq);
lexer_adv(lexer, 2);
token->string = "<=";
} else {
token = token_new(lexer, _bang);
token->string = "<";
lexer_adv(lexer, 1);
}

break;
case '>':
if (lexer_look(lexer, 1) == '<') {
token = token_new(lexer, _nteq);
lexer_adv(lexer, 2);
token->string = ">=";
}
else {
token = token_new(lexer, _bang);
token->string = ">";
lexer_adv(lexer, 1);
}

break;
case '&':
if (lexer_look(lexer, 1) == '&') {
token = token_new(lexer, _and);
lexer_adv(lexer, 2);
token->string = "&&";
} else {
token = token_new(lexer, _notype);
lexer_adv(lexer, 1);
}

break;
case '|':
if (lexer_look(lexer, 1) == '|') {
token = token_new(lexer, _or);
lexer_adv(lexer, 2);
token->string = "||";
}
else {
token = token_new(lexer, _notype);
lexer_adv(lexer, 1);
}

break;
case '%':
token = token_new(lexer, _mod);
token->string = "%";
lexer_adv(lexer, 1);
break;
case '^':
token = token_new(lexer, _mod);
token->string = "^";
lexer_adv(lexer, 1);
break;
case '!':
if (lexer_look(lexer, 1) == '=') {
token = token_new(lexer, _nteq);
lexer_adv(lexer, 2);
token->string = "!=";
}
else {
token = token_new(lexer, _bang);
token->string = "!";
lexer_adv(lexer, 1);
}

break;
case '\"':
token = lexer_str(lexer);
break;
case '#':
lexer_comment(lexer);
break;
case '(':
token = token_new(lexer, _lpara);
token->string = "(";
lexer_adv(lexer, 1);
break;
case ')':
token = token_new(lexer, _rpara);
token->string = ")";
lexer_adv(lexer, 1);
break;
case '{':
token = token_new(lexer, _lcurl);
token->string = "{";
lexer_adv(lexer, 1);
break;
case '}':
token = token_new(lexer, _rcurl);
token->string = "}";
lexer_adv(lexer, 1);
break;
case '[':
token = token_new(lexer, _lbrac);
token->string = "[";
lexer_adv(lexer, 1);
break;
case ']':
token = token_new(lexer, _rbrac);
token->string = "]";
lexer_adv(lexer, 1);
break;
case ';':
token = token_new(lexer, _terml);
token->string = ";";
lexer_adv(lexer, 1);
break;
default:
if (isalpha(c) || c == '_')
token = lexer_ident(lexer);
else if (isdigit(c) || c == '.') {
token = lexer_num(lexer);
} else if (isspace(c))
lexer_adv(lexer, 1);
else
token = token_new(lexer, _eof);

break;
}
}

return token;
}

void lexer_print(lexer_t* lexer) {
size_t line = lexer->line;
size_t pos = lexer->pos;
size_t ptr = lexer->ptr;
token_t* token = next_token(lexer);

while (token != NULL && token->type != _eof) {
token_print(token);
token = next_token(lexer);
}

lexer->ptr = ptr;
lexer->pos = pos;
}

lexer_t* lexer_open(const char* file_name) {
FILE* file_ptr = fopen(file_name, "rb");
lexer_t* lexer = malloc(sizeof(lexer_t));
lexer->errors = errorlist_new();
lexer->line = 1;
lexer->pos = 0;
lexer->ptr = 0;

if (file_ptr == NULL) {
error_new(lexer->errors, 0, 0, "Couldent open file \"%s\".\n", file_name);
fclose(file_ptr);
free(lexer);
}

if (fseek(file_ptr, 0, SEEK_END) != 0) {
fclose(file_ptr);
return NULL;
}

lexer->len = ftell(file_ptr);
if (lexer->len == -1) {
error_new(lexer->errors, 0, 0, "Unable to get the size of file \"%s\".\n", file_name);
fclose(file_ptr);
free(lexer);
}
fseek(file_ptr, 0, SEEK_SET);

lexer->src = malloc(lexer->len);
size_t r = fread(lexer->src, lexer->len, 1, file_ptr);
fclose(file_ptr);
return lexer;
}

void lexer_close(lexer_t* lexer) {
if (lexer->src != NULL)
free(lexer->src);

free(lexer);
}


Header

#ifndef LEXER_H
#define LEXER_H

#include "error.h"
#include <stdio.h>
#include <stdbool.h>
#include <malloc.h>
#include <assert.h>

typedef enum tk_type {
// primitives
_notype,
_str,
_gen_num,
_ident,
_type,

// symbols
_rbrac,
_lbrac,
_rpara,
_lpara,
_rcurl,
_lcurl,
_terml,

_assign,
_bang,

_add,
_addeql,
_sub,
_subeql,
_div,
_diveql,
_mul,
_muleql,
_exp,
_mod,

// comparison operators
_lt,
_lteq,
_gt,
_gteq,
_eqto,
_nteq,
_and,
_or,

// keywords
_while,
_for,
_if,
_else,
_match,
_case,
_return,
_break,
_int,
_float,
_enum,
_true,
_false,
_import,
_struct,
_mac,
_dbl,
_void,
_char,
_bool,
_const,

// abstract
_block,
_eof
} tk_type;

typedef struct token_t {
tk_type type;
size_t line;
size_t pos;

union {
char* string;
double flt;
size_t integer;
};
} token_t;

typedef struct lexer_t {
size_t line;
size_t pos;
size_t ptr;
size_t len;
char* src;

errorlist_t* errors;
} lexer_t;

void lexer_print(lexer_t* lexer);

#endif


Input

int main() {
int var = 10 + 2;
}

Answer

The obvious problem in lexer_int would seem to be at the end:

token->integer = v;
token->string = "";

Since token_t contains an anonymous union overlaying the integer, flt and string fields, this stores the number read, then immediately overwrites it with a pointer to the static string literal "". You want to remove the token->string = ""; line.

Of course, then your token_print routine will likely crash because it is trying to read the string field even when the token is not a string.

lexer_float has the same problem...

Comments