Submersed24 Submersed24 - 14 days ago 5
C Question

Frequency of letters - disproportionately high ratio

This program computes the relative frequency of each letter in a book. For some reason, A, B, and Z are getting drastically bigger numbers - in the billions. Am I missing something here? I can't see any error, but the resulting output is wrong.

argv[1]
is the book file and
argv[2]
is the new file.

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define NUM_LETTERS 26

/*
 * Counts the letters A-Z (case-insensitively) in the file named by argv[1]
 * and writes one line per letter, with its count and its share of all
 * counted letters, to the file named by argv[2].
 *
 * Returns 0 on completion; exits with status 1 if too few arguments are
 * given or the input file cannot be opened.
 */
int main( int argc, char *argv[] )
{
FILE *fp,*fp2;
int ch, totalcounter = 0, i;
/* NOTE(review): both arrays hold 25 elements, but every loop below runs
   i from 0 to NUM_LETTERS-1 (NUM_LETTERS is 26), so index 25 lies outside
   the arrays. */
int letters[25];
double letterfrequency[25];
/* NOTE(review): this only guarantees argv[1]; argv[2] is used further down
   without being checked. */
if(argc < 2)
{
fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
exit(1);
}
/* Zero the counters and frequencies before counting. */
for(i=0; i<NUM_LETTERS; i++)
{
letters[i] = 0;
letterfrequency[i] = 0.0;
}
printf("Opening: %s", argv[1]);
fp = fopen(argv[1], "r");
if(!fp)
{
perror("fopen");
exit(1);
}
/* Read the input one character at a time until end of file. */
while((ch=fgetc(fp)) != EOF)
{
/* Fold to upper case so 'a' and 'A' land in the same counter. */
ch = toupper(ch);
if('A' <= ch && ch <= 'Z')
{
/* 65 is the character code of 'A' in ASCII; this maps 'A'..'Z' to 0..25. */
ch -= 65;
letters[ch]++;
totalcounter++;
}
}

/* NOTE(review): the result of this fopen is never checked before fp2 is
   passed to fprintf and fclose. */
fp2 = fopen(argv[2], "w");
/* Each frequency is the letter's count divided by the total letters counted. */
for(i=0; i<NUM_LETTERS; i++)
{
letterfrequency[i] = (double)letters[i]/totalcounter;
}
/* Write one report line per letter; i+65 converts the index back to a
   character for %c. */
for(i=0; i<NUM_LETTERS; i++)
{
fprintf(fp2, "\n%c: Times used: %10d\tFrequency Used: %20.20lf", i+65, letters[i], letterfrequency[i]);
}
fclose(fp);
fclose(fp2);
return 0;
}

Answer

The fundamental problem is that you are trying to squeeze the counts for 26 letters into space reserved for 25. This does not lead to happiness.

This is a minor cleanup of your code. Apart from changing the array sizes to NUM_LETTERS, it reports the usage better, checks that the output file is opened, uses isalpha() to check for letters, and converts from upper-case to offset by subtracting 'A' rather than 65.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define NUM_LETTERS 26

/*
 * Count how often each letter A-Z occurs in a text file and write a report.
 *
 *   argv[1] - input file to read
 *   argv[2] - output file to write
 *
 * Upper- and lower-case letters are counted together.  Each report line
 * shows a letter, its count, and that count divided by the total number of
 * letters counted (0 for every letter when the input contains no letters).
 *
 * Returns 0 on success; exits with status 1 on a usage error or when a file
 * cannot be opened, read, or completely written.
 */
int main(int argc, char *argv[])
{
    FILE *fp, *fp2;
    int ch, totalcounter = 0, i;
    int letters[NUM_LETTERS];
    double letterfrequency[NUM_LETTERS];

    if (argc != 3)
    {
        fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
        exit(1);
    }

    for (i = 0; i < NUM_LETTERS; i++)
    {
        letters[i] = 0;
        letterfrequency[i] = 0.0;
    }

    printf("Opening: %s\n", argv[1]);
    fp = fopen(argv[1], "r");
    if (!fp)
    {
        perror("fopen");
        exit(1);
    }

    while ((ch = fgetc(fp)) != EOF)
    {
        if (isalpha(ch))
        {
            int idx = toupper(ch) - 'A';

            /* Guard the index: in a non-"C" locale isalpha() may accept
               characters that do not map into 'A'..'Z'. */
            if (idx >= 0 && idx < NUM_LETTERS)
            {
                letters[idx]++;
                totalcounter++;
            }
        }
    }
    /* fgetc() returns EOF for read errors as well as end of file. */
    if (ferror(fp))
    {
        perror("fgetc");
        fclose(fp);
        exit(1);
    }
    fclose(fp);

    /* With no letters at all, leave every frequency at 0.0 rather than
       dividing by zero. */
    if (totalcounter > 0)
    {
        for (i = 0; i < NUM_LETTERS; i++)
        {
            letterfrequency[i] = (double)letters[i] / totalcounter;
        }
    }

    printf("Opening: %s\n", argv[2]);
    fp2 = fopen(argv[2], "w");
    if (!fp2) /* check the output stream, not the already-closed input */
    {
        perror("fopen");
        exit(1);
    }

    for (i = 0; i < NUM_LETTERS; i++)
    {
        fprintf(fp2, "%c: Times used: %10d\tFrequency Used: %20.20lf\n", 'A' + i, letters[i], letterfrequency[i]);
    }
    /* fclose() flushes buffered output, so a failure here means the report
       may be incomplete. */
    if (fclose(fp2) != 0)
    {
        perror("fclose");
        exit(1);
    }
    return 0;
}

Example output (on the source code above — fq97.c):

Opening: fq97.c
Opening: /dev/stdout
A: Times used:         17   Frequency Used: 0.02956521739130434784
B: Times used:          3   Frequency Used: 0.00521739130434782650
C: Times used:         26   Frequency Used: 0.04521739130434782733
D: Times used:         13   Frequency Used: 0.02260869565217391366
E: Times used:         78   Frequency Used: 0.13565217391304348893
F: Times used:         41   Frequency Used: 0.07130434782608695288
G: Times used:         13   Frequency Used: 0.02260869565217391366
H: Times used:         13   Frequency Used: 0.02260869565217391366
I: Times used:         50   Frequency Used: 0.08695652173913043237
J: Times used:          0   Frequency Used: 0.00000000000000000000
K: Times used:          0   Frequency Used: 0.00000000000000000000
L: Times used:         33   Frequency Used: 0.05739130434782608453
M: Times used:          8   Frequency Used: 0.01391304347826087008
N: Times used:         43   Frequency Used: 0.07478260869565217517
O: Times used:         25   Frequency Used: 0.04347826086956521618
P: Times used:         26   Frequency Used: 0.04521739130434782733
Q: Times used:          5   Frequency Used: 0.00869565217391304358
R: Times used:         52   Frequency Used: 0.09043478260869565466
S: Times used:         25   Frequency Used: 0.04347826086956521618
T: Times used:         61   Frequency Used: 0.10608695652173913415
U: Times used:         26   Frequency Used: 0.04521739130434782733
V: Times used:          6   Frequency Used: 0.01043478260869565299
W: Times used:          2   Frequency Used: 0.00347826086956521752
X: Times used:          3   Frequency Used: 0.00521739130434782650
Y: Times used:          6   Frequency Used: 0.01043478260869565299
Z: Times used:          0   Frequency Used: 0.00000000000000000000
Comments