Anav Gagneja Anav Gagneja - 19 days ago 7
C Question

Parsing string to get comma-separated integer character pairs

I'm working on a project where I'm given a file that begins with a header in this format:

a1,b3,t11, 2,,5,\3,*4,344,00,
. It is always going to be a sequence of pairs, each a single ASCII character followed by an integer, with the pairs separated by commas and the sequence always ending with
00,
.

Basically what I have to do is go through this and put each character/integer pair into a data type I have that takes both of these as parameters and make a vector of these. For example, the header I gave above would be a vector with
('a',1), ('b',3), ('t',11), (' ',2), (',',5), ('\',3), ('*',4), ('3',44)
as elements.

I'm just having trouble parsing it. So far I've:
  • Extracted the header from my text file from the first character up until before the ',00,' where the header ends. I can get the header string in string format or as a vector of characters (whichever is easier to parse)
  • Tried using sscanf to parse the next character and the next int then adding those into my vector before using substrings to remove the part of the string I've already analyzed (this was messy and did not get me the right result)
  • Tried going through the string as a vector and checking each element to see if it is an integer, a character, or a comma and acting accordingly but this doesn't work for multiple-digit integers or when the character itself is an int

    I know I can fairly easily split my string based on the commas but I'm not sure how to do this and still split the integers from the characters while retaining both and accounting for integers that I need to treat as characters.

    Any advice or useful standard library or string functions would be greatly appreciated.

  • Answer

    One possibility, of many, would be to store the data in a structure. This uses an array of structures but the structure could be allocated as needed with malloc and realloc.
    Parsing the string can be accomplished using pointers and strtol which will parse the integer and give a pointer to the character following the integer. That pointer can be advanced to use in the next iteration to get the ASCII character and integer.

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    
    //number of elements in the pairs array in main, and the upper bound of its parsing loop
    #define SIZE 100
    
    /* One parsed header entry: a single character and the integer that follows it. */
    struct pair {
        char ascii;    /* the character that starts the entry */
        int integer;   /* the number read right after that character */
    };
    
    //Parse a header made of <character><integer> entries separated by commas and
    //terminated by 00, into an array of struct pair, then print every stored pair.
    //Parsing stops at the end of the text, at the first malformed entry ( a message
    //is printed to stdout) or when SIZE pairs have been stored.
    //Always returns 0.
    int main( void) {
        char input[] = "a1,b3,!0,t11, 2,,5,\\3,*4,34400,";
        char *pt = input;//start with pt pointing to first character of input
        char *end = input;
        int each = 0;//number of pairs stored so far
        int loop = 0;
        size_t length = 0;
        struct pair pairs[SIZE] = { { '\0', 0}};

        //the header is expected to end in 00,
        //strip those three characters, but only when they really are there
        length = strlen ( input);
        if ( length >= 3 && strcmp ( &input[length - 3], "00,") == 0) {
            input[length - 3] = '\0';
        }
        while ( each < SIZE) {
            //nothing left to parse: empty text, or the text ended with a comma ( the ,00, form)
            //without this check pt++ below would step past the terminator
            if ( *pt == '\0') {
                break;
            }
            //get the ASCII character and advance one character
            pairs[each].ascii = *pt;
            pt++;
            //get the integer ( strtol returns long, values outside the range of int are not detected here)
            pairs[each].integer = strtol ( pt, &end, 10);
            //end==pt indicates the expected integer is missing, the entry is not counted
            if ( end == pt) {
                printf ( "expected an integer\n");
                break;
            }
            //the entry is complete, count it ( each may reach SIZE, it is a count, not an index)
            each++;
            //at the end of the string?
            if ( *end == '\0') {
                break;
            }
            //the character following the integer should be a comma
            if ( *end != ',') {
                printf ( "format problem\n");
                break;
            }
            //for the next iteration, advance pt by one character past end
            pt = end + 1;
        }
        //loop through and print the used structures
        for ( loop = 0; loop < each; loop++) {
            printf ( "ascii[%d] = %c   ", loop, pairs[loop].ascii);
            printf ( "integer[%d] = %d\n", loop, pairs[loop].integer);
        }

        return 0;
    }
    

    Another option is to use dynamic allocation.
    This also uses sscanf to parse the input. The %n will capture the number of characters processed by the scan. The offset and add variables can then be used to iterate through the input. The last scan will only capture the ascii character and the integer and the return from sscanf will be 2.

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    
    //a single header entry: one character plus the integer that follows it
    struct pair {
        char ascii;//character that starts the entry
        int integer;//number scanned right after the character
    };
    
    //Parse a header made of <character><integer> entries separated by commas and
    //terminated by 00, into a dynamically grown array of struct pair, then print
    //every stored pair and release the array.
    //Parsing stops at the end of the text, when an entry cannot be scanned, when
    //an integer is not followed by a comma, or when realloc fails.
    //Always returns 0.
    int main( void) {
        char input[] = "a1,b3,!0,t11, 2,,5,\\3,*4,34400,";
        char comma = '\0';
        char ascii = '\0';
        int integer = 0;
        int result = 0;
        int loop = 0;
        size_t length = 0;
        int used = 0;//number of pairs stored so far
        int add = 0;//characters consumed by the latest complete scan
        int offset = 0;//index in input where the next scan starts
        struct pair *pairs = NULL;//so realloc will work on first call
        struct pair *temp = NULL;

        //the header is expected to end in 00,
        //strip those three characters, but only when they really are there
        length = strlen ( input);
        if ( length >= 3 && strcmp ( &input[length - 3], "00,") == 0) {
            input[length - 3] = '\0';
        }
        while ( ( result = sscanf ( &input[offset], "%c%d%c%n"
        , &ascii, &integer, &comma, &add)) >= 2) {//the last scan will only get two items
            //grow through temp so the existing array is still owned by pairs on failure
            if ( ( temp = realloc ( pairs, ( used + 1) * sizeof ( *pairs))) == NULL) {
                fprintf ( stderr, "problem allocating\n");
                break;
            }
            pairs = temp;

            pairs[used].ascii = ascii;
            pairs[used].integer = integer;
            //one more element was used
            used++;
            //only two items: the text ended right after the integer
            //%n was not reached so add is stale ( still 0 on a first scan), advancing
            //offset by it could loop forever or move outside the text
            if ( result == 2) {
                break;
            }
            //the character following the integer should be a comma
            if ( comma != ',') {
                printf ( "format problem\n");
                break;
            }
            //for the next iteration, add to offset
            offset += add;
        }
        for ( loop = 0; loop < used; loop++) {
            printf ( "ascii[%d] = %c   ", loop, pairs[loop].ascii);
            printf ( "value[%d] = %d\n", loop, pairs[loop].integer);
        }

        free ( pairs);

        return 0;
    }
    
    Comments