MooseMan55 MooseMan55 - 2 months ago 12
C Question

Why is my trie leaking data?

I am trying to make a speller checker using a trie, but the words I try to add to my trie don't seem to be getting put in. Where is the leak?

I've spent hours using a debugger and stepping through my code...

Main Function:

/**
* Implements a spell-checker.
*/

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary
#define DICTIONARY "dictionaries/large"

// prototype
double calculate(const struct rusage *b, const struct rusage *a);

int main(int argc, char *argv[])
{
// check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: speller [dictionary] text\n");
return 1;
}

// structs for timing data
struct rusage before, after;

// benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

// determine dictionary to use
char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

// load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}

// calculate time to load dictionary
time_load = calculate(&before, &after);

// try to open text
char *text = (argc == 3) ? argv[2] : argv[1];
FILE *fp = fopen(text, "r");
if (fp == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}

// prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");

// prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH+1];

// spell-check each word in text
for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
{
// allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// append character to word
word[index] = c;
index++;

// ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// consume remainder of alphabetical string
while ((c = fgetc(fp)) != EOF && isalpha(c));

// prepare for new word
index = 0;
}
}

// ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// consume remainder of alphanumeric string
while ((c = fgetc(fp)) != EOF && isalnum(c));

// prepare for new word
index = 0;
}

// we must have found a whole word
else if (index > 0)
{
// terminate current word
word[index] = '\0';

// update counter
words++;

// check word's spelling
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);

// update benchmark
time_check += calculate(&before, &after);

// print word if misspelled
if (misspelled)
{
printf("%s\n", word);
misspellings++;
}

// prepare for next word
index = 0;
}
}

// check whether there was an error
if (ferror(fp))
{
fclose(fp);
printf("Error reading %s.\n", text);
unload();
return 1;
}

// close text
fclose(fp);

// determine dictionary'size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);

// calculate time to determine dictionary's size
time_size = calculate(&before, &after);

// unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);

// abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}

// calculate time to unload dictionary
time_unload = calculate(&before, &after);

// report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);

// that's all folks
return 0;
}

/**
* Returns number of seconds between b and a.
*/
double calculate(const struct rusage *b, const struct rusage *a)
{
if (b == NULL || a == NULL)
{
return 0.0;
}
else
{
return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
(b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
(b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
/ 1000000.0);
}
}


Functions that check to see if a word is in the trie:

/**
* Implements a dictionary's functionality.
*/

#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>

#include "dictionary.h"

int dictionary_size;

bool find(char *word, node *next)
{
word++;
if (next == NULL)
{
return false;
}
else if (word[0] == '\0')
{
if (next->is_word == true)
{
return true;
}
else
{
return false;
}
}
else
{
//go to this nodes chold that is now eual to the first letter of the word
if (word[0] == '\'')
{
return find(word, next->children[26]);
}
else
{
return find(word, next->children[word[0] - 'a']);
}
}
}

/**
* Returns true if word is in dictionary else false.
*/
bool check(const char *word)
{
//get word from text and make it malluable
char w[47];
memcpy(w, word, strlen(word) + 1);
//make sure it's all lower case
for (int i = 0; i < strlen(word); i++)
{
w[i] = tolower(w[i]);
}
//go to the root child node equal to the first letter of the word
if (find(w, root))
{
return true;
}
else
{
return false;
}
}


functions that load a dictionary of words into the trie. (This where the leak is i'm guessing?):

struct node *newNode(char *word, node *next)
{
next = malloc(sizeof(node));
word++;
if (word[0] == '\0')
{
next->is_word = true;
return next;
}
else
{
node *new_node = NULL;
// go to this nodes choild that is now equal to the first letter of the word
if (word[0] == '\'')
{
return next->children[26] = newNode(word, new_node);
}
else
{
return next->children[word[0] - 97] = newNode(word, new_node);
}
}
}


/**
* Loads dictionary into memory. Returns true if successful else false.
*/
bool load(const char *dictionary)
{
dictionary_size = 0;
FILE *dic = fopen(dictionary, "r");
if (dic == NULL)
{
return false;
}
// Initalize root node
root = malloc(sizeof(node));

//get word from dictinary
int ch;
while ((ch = fgetc(dic)) != EOF)
{
char word[47];
fgets(word, 47, dic);
dictionary_size++;
// make sure it's all lower case
for (int i = 0; i < 47; i++)
{
word[i] = tolower(word[i]);
}
// get rid of new line char
char *pos;
if ((pos = strchr(word, '\n')) != NULL)
{
*pos = '\0';
}
printf("%s\n", word);
//go to root nodes child that is equal to the first letter of the word
node *child_node = NULL;
root->children[word[0] - 'a'] = newNode(word, child_node);
}
return true;
}

/**
* Returns number of words in dictionary if loaded else 0 if not yet loaded.
*/
unsigned int size(void)
{
if (dictionary_size != 0)
{
return dictionary_size;
}
return 0;
}

void free_node(node *next)
{
// safety including root node
if(!next) return;

// takes you to end of trie
for (int i = 0; i < 26; i++)
{
free_node(next->children[i]);
}

// base case
free(next);
}

/**
* Unloads dictionary from memory. Returns true if successful else false.
*/
bool unload(void)
{
free_node(root);
return true;
}


Defining the node struct:

typedef struct node
{
bool is_word;
struct node *children[27];
}
node;

node *root;

Answer

This code works using radically rewritten addWord() function in place of addNode().

/* SO 3994-9116 */
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct node
{
    bool is_word;
    struct node *children[27];
} node;

static node *root = 0;
static int dictionary_size = 0;

static void addWord(char *word, node *trie)
{
    assert(islower((unsigned char)word[0]) || word[0] == '\0');
    if (word[0] == '\0')
        trie->is_word = true;
    else
    {
        int code = word[0] - 'a';
        if (trie->children[code] == 0)
        {
            trie->children[code] = calloc(sizeof(node), 1);
            if (trie->children[code] == 0)
            {
                fprintf(stderr, "Memory allocation failed\n");
                exit(1);
            }
        }
        addWord(&word[1], trie->children[code]);
    }
}

static
bool load(const char *dictionary)
{
    dictionary_size = 0;
    FILE *dic = fopen(dictionary, "r");
    if (dic == NULL)
        return false;
    root = calloc(1, sizeof(node));
    if (root == 0)
    {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    char word[47];
    while (fgets(word, sizeof(word), dic) != 0)
    {
        char *pos;
        if ((pos = strchr(word, '\n')) != NULL)
            *pos = '\0';
        dictionary_size++;
        for (int i = 0, len = strlen(word); i < len; i++)
        {
            if (!isalpha((unsigned char)word[i]))
                fprintf(stderr, "Non-alphabetic character in '%s'\n", word);
            else
                word[i] = tolower((unsigned char)word[i]);
        }
        addWord(word, root);
    }
    return true;
}

static void print_subtrie(const char *prefix, char nchar, node *child)
{
    if (child != 0)
    {
        int len = strlen(prefix);
        char buffer[len + 2];
        strcpy(buffer, prefix);
        buffer[len] = nchar;
        buffer[len+1] = '\0';
        if (child->is_word)
            printf("%s\n", buffer);
        for (int i = 0; i < 26; i++)
            print_subtrie(buffer, 'a' + i, child->children[i]);
    }
}

static void print_trie(node *top)
{
    for (int i = 0; i < 26; i++)
        print_subtrie("", 'a' + i, top->children[i]);
}

static void free_trie(node *trie)
{
    if (trie != 0)
    {
        for (int i = 0; i < 27; i++)
        {
            if (trie->children[i] != 0)
                free_trie(trie->children[i]);
        }
        free(trie);
    }
}

int main(void)
{
    if (load("words.list"))
    {
        printf("Nominal dictionary size: %d\n", dictionary_size);
        print_trie(root);
        free_trie(root);
    }
    return 0;
}

Test File 1:

aardvark
abelone
assignation
assignment
assigned
assert
assertion

Output File 1:

Nominal dictionary size: 7
aardvark
abelone
assert
assertion
assignation
assigned
assignment

Test File 2:

Mike
Aquarius
delta
assert
zulu
assertion
alpha
aardvark
abelone
culmination
Zulu
kilo
amniocentesis
Oscar
Yankee
cabal
echo
Sierra
duck
Charlie
Quebec
centre
assigned
Papa
foxtrot
uMBRelLA
Romeo
cab
India
Juliet
bravo
amniocentesis
hotel
amniocentesis
Lima
Whisky
cab
culminate
cold
Xray
cab
cabs
assignation
cam
Victor
golf
Tango
November
cinema
cabbie
assignment

Output File 2:

Nominal dictionary size: 56
a
aardvark
abelone
alpha
amniocentesis
aquarius
as
assert
assertion
assign
assignation
assigned
assignment
bravo
cab
cabal
cabbie
cabs
cam
centre
charlie
cinema
cold
culminate
culmination
delta
duck
echo
foxtrot
golf
hotel
i
india
juliet
kilo
lima
mike
november
o
oscar
papa
quebec
romeo
sierra
tango
umbrella
victor
whisky
xray
yankee
zulu

Note that the 'nominal size' of 56 is too large; there are 46 distinct words in the input and hence in the output.