EricFlorentNoube EricFlorentNoube - 3 months ago 8
C++ Question

Parsing a string : my code apparentely works after testing, but is inelegant

This is a

c++
question. I want the code of the desired
c++
function (see below) to be (as much as possible) in
c
-style and using
c
string library functions because I think that this will lead the quickest code in terms of execution time. (Am I wrong ? If so, how much ?) Yes, I value performance more than readability for this question because the desired function will be called a lot (millions) of times.

I am receiving
const char *
's of the form "25Dec2016" that represent dates and I am parsing them to back out from them three
int
's (one for the day, the second for the month and the last for the year (that I assumed to be a number between 0 and 9999)) thanks to a function

Parse(const char * cDate, int & day, int & month, int & year)


I coded such a function and tested it : it works on correct
const char*
s (those that indeed represent date in my format), but I feel that my use of
c
functions (
atoi
for instance) is incorrect, even if I don't see why. There are also other problems :


  • the code is inelegant (the big
    if ... else if ... if
    ) : one cannot do a
    switch
    statement on a string, but is there an elegant way to do this without resorting the
    std::map
    and
    c++11
    ?

  • surely problematic from a c string point of view (I am not an expert) : for instance, I am really not happy with the way I extract the three substring into "buffers" ... Plus it appears I have problems with not null terminated
    char
    arrays that I'd like to correct. I could force a
    \0
    at the end of
    _day
    and
    _year
    as I did for
    _month
    but I find that doing so is awful, so that I suspect a bad "design" of my parsing function

  • quite bad from an error handling point of view : the function is not a constructor for now, but could finally be, this is the reason why I
    throw
    .



I am open to any comments !

Here is the initial code :

Parse(const char * cDate, int & day, int & month, int & year)
{
if (0 == cDate)
{
throw "Error : null string pointer";
}
else
{
if (strlen(cDate) < 8)
{
throw "Error : invalid string format";
}
else
{
char _day[2];
char _month[4];
char _year[5]; // implictely the biggest year we authorize is 99999 ; it should suffice
for (int i = 0; i < 2; ++i)
{
_day[i] = cDate[i];
}
day = atoi(_day); // if fail, Undefined behaviour, see strtol for a more robust cross-platform alternative
char c;
for (int i = 2; i < 5; ++i)
{
c = cDate[i];
_month[i-2] = toupper(c);
}
_month[3] = '\0';
if (0 == strcmp("JAN", _month))
{
month = 1;
}
else if (0 == strcmp("FEB", _month))
{
month = 2;
}
else if (0 == strcmp("MAR", _month))
{
month = 3;
}
else if (0 == strcmp("APR",_month))
{
month = 4;
}
else if (0 == strcmp("MAY", _month))
{
month = 5;
}
else if (0 == strcmp("JUN", _month))
{
month = 6;
}
else if (0 == strcmp("JUL", _month))
{
month = 7;
}
else if (0 == strcmp("AUG", _month))
{
month = 8;
}
else if (0 == strcmp("SEP", _month))
{
month = 9;
}
else if (0 == strcmp("OCT",_month))
{
month = 10;
}
else if (0 == strcmp("NOV", _month))
{
month = 11;
}
else if (0 == strcmp("DEC", _month))
{
month = 12;
}
else
{
throw "Error : invalid month string";
}
for (int i = 5; i < 10; ++i)
{
_year[i-5] = cDate[i];
}
year = atoi(_year);
}
}
}


I finally opted for the function to be a constructor of a
Date
class, and inspired myself from rici's answer also using
strtol
as I intended initially (see comment in my initial code) instead of
atoi
:

#include <cstring> // for strlen
#include <ctype.h> // for toppuer
#include <stdlib.h>

int up(char c)
{
return toupper((unsigned char)(c));
}

Date::Date(const char * cDate)
{
if (0 == cDate)
{
throw "Error : null string pointer";
}
else
{
if (strlen(cDate) < 8)
{
throw "Error : invalid string format. String format is DDMMMYYYY with M's in upper or lower case"; // for now, valid format is 24Oct1979
}
else
{
char * ppEnd;
int day = strtol(cDate, &ppEnd, 10);
if (0 == day)
throw "Error : invalid string format. String format is DDMMMYYYY with M's in upper or lower case";
m_Day = day;
char cMonth[4];
int month;
memcpy(cMonth, &ppEnd[0], 3);
switch (up(cMonth[0]))
{
case 'A':
{
switch (up(cMonth[1]))
{
case 'P': if (up(cMonth[2]) == 'R') month = 4;
break;
case 'U': if (up(cMonth[2]) == 'G') month = 8;
break;
}
break;
}
case 'D':
{
if (up(cMonth[1]) == 'E' && up(cMonth[2]) == 'C')
month = 12;
break;
}
case 'F':
{
if (up(cMonth[1]) == 'E' && up(cMonth[2]) == 'B')
month = 2;
break;
}
case 'J':
{
switch (up(cMonth[1]))
{
case 'A': if (up(cMonth[2]) == 'N')
month = 1;
break;
case 'U': switch (up(cMonth[2]))
{
case 'N': month = 6;
case 'L': month = 7;
}
break;
}
break;
}
case 'M':
{
if (up(cMonth[1]) == 'A')
{
switch (up(cMonth[2]))
{
case 'R': month = 3;
case 'Y': month = 5;
}
}
break;
}
case 'N':
{
if (up(cMonth[1]) == 'O' && up(cMonth[2]) == 'V') month = 11;
break;
}
case 'O':
{
if (up(cMonth[1]) == 'C' && up(cMonth[2]) == 'T') month = 10;
break;
}
case 'S':
{
if (up(cMonth[1]) == 'E' && up(cMonth[2]) == 'P') month = 9;
break;
}
}
m_Month = (Month)month;
int year = strtol(ppEnd + 3, &ppEnd, 10);
if (0 == year)
throw "Error : invalid string format. String format is DDMMMYYYY with M's in upper or lower case";
m_Year = year;
updateSerial();
}
}
}


Remark. Being lazy, I did not throw everywhere I should in the "month" part of the code.

Answer

If your system is Posix-compatible, then you could just use strptime with the format %d%b%Y:

bool Parse(const char* date, int& day, int& month, int& year) {
    struct tm components;
    const char* rest = strptime(date, "%d%b%Y", &components);
    if (rest == NULL || *rest != '\0') return false;
    day = components.tm_mday;
    month = components.tm_mon;
    year = components.tm_year + 1900;
    return true;
}

That is likely to be as fast as a naive parser, and it is certainly a lot easier to write :)

Otherwise, you should use strtol rather than atoi, since it will let you know both whether the parse was successful and where the next character is. And if you want to parse the month names quickly, you'll want to build a trie, either as a table or directly in code (the table is probably faster, fwiw):

static int up(char c) { return toupper((unsigned char)(c)); }
int parseMonth(const char* p) {
  switch (up(p[0])) {
    case 'A': {
      switch (up(p[1])) {
        case 'P': if (up(p[2]) == 'R') return 4;
                  break;
        case 'U': if (up(p[2]) == 'G') return 8;
                  break;
      }
      break;
    }
    case 'D': {
      if (up(p[1]) == 'E' && up(p[2]) == 'C') return 12;
      break;
    }
    case 'F': {
      if (up(p[1]) == 'E' && up(p[2]) == 'B') return 2;
      break;
    }
    case 'J': {
      switch (up(p[1])) {
        case 'A': if (up(p[2]) == 'N') return 1;
                  break;
        case 'U': switch (up(p[2])) {
                    case 'N': return 6;
                    case 'L': return 7;
                  }
                  break;
      }
      break;
    }
    case 'M': {
      if (up(p[1]) == 'A') {
        switch (up(p[2])) {
          case 'R': return 3;
          case 'Y': return 5;
        }
      }
      break;
    }
    case 'N': {
      if (up(p[1]) == 'O' && up(p[2]) == 'V') return 11;
      break;
    }
    case 'O': {
      if (up(p[1]) == 'C' && up(p[2]) == 'T') return 10;
      break;
    }
    case 'S': {
      if (up(p[1]) == 'E' && up(p[2]) == 'P') return 9;
      break;
    }
  }
  return -1;
}
Comments