lyric lyric - 1 month ago 12
C Question

segmentation fault in c program with a trie

I'm getting a seg fault with this spell check program. I tried gnu and valgrind but i can't figure out what is going on. (node is defined in a .h file)
the program is a spell check for cs50x
it initially worked on the smaller dictionary but now it seg faults while running that as well.

here is the node:

typedef struct node
{
bool is_word;
struct node* letters[27];
}
node;


and here is the speller.c from the course - written by the course staff:

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary
#define DICTIONARY "dictionaries/large"

// prototype
double calculate(const struct rusage* b, const struct rusage* a);

int main(int argc, char* argv[])
{
// check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: speller [dictionary] text\n");
return 1;
}

// structs for timing data
struct rusage before, after;

// benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

// determine dictionary to use
char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

// load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}

// calculate time to load dictionary
time_load = calculate(&before, &after);

// try to open text
char* text = (argc == 3) ? argv[2] : argv[1];
FILE* fp = fopen(text, "r");
if (fp == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}

// prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");

// prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH+1];

// spell-check each word in text
for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
{
// allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// append character to word
word[index] = c;
index++;

// ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// consume remainder of alphabetical string
while ((c = fgetc(fp)) != EOF && isalpha(c));

// prepare for new word
index = 0;
}
}

// ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// consume remainder of alphanumeric string
while ((c = fgetc(fp)) != EOF && isalnum(c));

// prepare for new word
index = 0;
}

// we must have found a whole word
else if (index > 0)
{
// terminate current word
word[index] = '\0';

// update counter
words++;

// check word's spelling
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);

// update benchmark
time_check += calculate(&before, &after);

// print word if misspelled
if (misspelled)
{
printf("%s\n", word);
misspellings++;
}

// prepare for next word
index = 0;
}
}

// check whether there was an error
if (ferror(fp))
{
fclose(fp);
printf("Error reading %s.\n", text);
unload();
return 1;
}

// close text
fclose(fp);

// determine dictionary's size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);

// calculate time to determine dictionary's size
time_size = calculate(&before, &after);

// unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}

// calculate time to unload dictionary
time_unload = calculate(&before, &after);

// report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);

// that's all folks
return 0;
}

/**
* Returns number of seconds between b and a.
*/
double calculate(const struct rusage* b, const struct rusage* a)
{
if (b == NULL || a == NULL)
{
return 0.0;
}
else
{
return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
(b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
(b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
/ 1000000.0);
}
}

// prototype of build trie function and unload
int build_trie(int letter, node* root, FILE* dict);
bool unloader(node* head);

// declare global variables
node* root;
int total_words = 0;

/**
* Returns true if word is in dictionary else false.
*/
bool check(const char* word)
{
// iniitialize pointer trav to point to same as root
node* trav = root;

// iterate over the word to see if letters are "open" paths in trie
int i = 0;
while (word[i] != '\0')
{
// check if letter
if (isalpha(word[i]))
{
if (trav->letters[tolower(word[i]) -97] == NULL)
{
return false;
}
else
{
trav = trav->letters[tolower(word[i]) -97];
i++;
}
}
else if (word[i] == '\'')
{
if (trav->letters[26] == NULL)
{
return false;
}
else
{
trav = trav->letters[26];
i++;
}
}
else
{
return false;
}
}

// at the end check if word
if (word[i] == '\0')
return (trav->is_word);
else
return false;
}

/**
* Loads dictionary into memory. Returns true if successful else false.
*/
bool load(const char* dictionary)
{
// open a file to read the dictionary
FILE* dict = fopen(dictionary, "r");
if (dict == NULL)
{
printf("Could not open %s.\n", dictionary);
return 1;
}

// create root
root = (node*)malloc(sizeof(node));

// initialize all pointers in root to NULL
for (int i = 0; i < 27; i++)
{
root->letters[i] = NULL;
}

// initialize trav
node* trav = root;
// initialize temp variable to store new word
int letter;

// while loop to read through the dictionary
while ((letter = fgetc(dict)) != EOF)
{
build_trie(letter, trav, dict);
}

if (letter == EOF)
return true;
else
return false;
}

// build a trie function
// to be used within load function recursively

int build_trie(int letter, node* trav, FILE* dict)
{
// set up place as value of letter - letters array place to "open"
int place = letter;


// set base case for recursive function -> end of string "\n"
if (letter == '\n')
{
trav->is_word = true;
total_words++;
for (int i = 0; i < 27; i++)
{
trav->letters[i] = NULL;
}
return 0;
}

// recursive part
else
{
// determine where in letters arrays to go
if (isalpha(letter))
{
place = letter - 97;
}
else if (letter == 44)
{
place = 26;
}

// check to see if new node exists
// if not - create new node and recurse with new letter and pointer
if (trav->letters[place] == NULL)
{
trav->letters[place] = (node*)malloc(sizeof(node));

// initialize new nodes pointers to NULL
for (int i = 0; i < 27; i++)
{
trav->letters[place]->letters[i] = NULL;
}

letter = fgetc(dict);
return(build_trie(letter, trav->letters[place], dict));
}

// if it does exist - get new letter - and recurse with pointer to that node
else
{
letter = fgetc(dict);
return build_trie(letter, trav->letters[place], dict);
}
}
return 1;
}

/**
* Returns number of words in dictionary if loaded else 0 if not yet loaded.
*/
unsigned int size(void)
{
return total_words;
}

/**
* Unloads dictionary from memory. Returns true if successful else false.
*/
bool unload(void)
{
node* head = root;
return unloader(head);
}

/**
* recursive function to free the trie takes in the "head" of the trie
*/
// use recursion to free all nodes
bool unloader(node* head)
{
// check base case - all null pointers -> free
int total = 0;
for (int i = 0; i < 27; i++)
{
if (head->letters[i] == NULL)
{
total++;
}
}

// base case all 27 pointers == NULL
if (total == 27)
{
free(head);
head = NULL;
return true;
}
else
{
for (int i = 0; i < 27; i++)
{
if (head->letters[i] != NULL)
{
unloader(head->letters[i]);
}
}
}
return false;
}

Answer

When looking at your code, there is an incoherence between the build_trie() during the dictionary load and the check() function to search a read word in the dictionary.

In the build_trie() function, the letters[26] is used to the ASCII code 44 (magic number ?):

if (isalpha(letter))
{
    place = letter - 97;
}
else if (letter == 44) // magic number = ','
{
    place = 26;
}

In the check() function, the letters[26] is used to '(apostrophe).

else if (word[i] == '\'') // apostrophe character = ASCII 39
{
    if (trav->letters[26] == NULL)
    {
        return false;
    }
    else
    {
        trav = trav->letters[26];
        i++;
    }
}

In the build_trie() function, when the place index is calculated, there are the following unexpected behavior:

if (isalpha(letter)) // both uppercase/lowercase are available 
{
    place = letter - 97; // magic number 97 = ASCII 'a'
}
else if (letter == 44) //'\'')
{
    place = 26;
}

if (trav->letters[place] == NULL)

Using the magic number 97 instead of 'a', could occur a negative place index if letter is an uppercase. And before accessing to the letters[place], the index is not checked.

If the letter is not isalpha() nor 44 and nor \n, the place index has been assign to the letter value and could be any 8bits value.