/*--------------------------------------------------------------------*/ /* dfa.c */ /* Author: Bob Dondero */ /* Illustrate lexical analysis using a deterministic finite state */ /* automaton (DFA) */ /*--------------------------------------------------------------------*/ #include "dynarray.h" #include #include #include #include #include /*--------------------------------------------------------------------*/ enum {MAX_LINE_SIZE = 1024}; enum {FALSE, TRUE}; enum TokenType {TOKEN_NUMBER, TOKEN_WORD}; enum LexState {STATE_START, STATE_IN_NUMBER, STATE_IN_WORD, STATE_ERROR, STATE_EXIT}; /*--------------------------------------------------------------------*/ /* A Token is either a number or a word, expressed as a string. */ struct Token { /* The type of the token. */ enum TokenType eType; /* The string which is the token's value. */ char *pcValue; }; /*--------------------------------------------------------------------*/ static void freeToken(void *pvItem, void *pvExtra) /* Free token pvItem. pvExtra is unused. */ { struct Token *psToken = (struct Token*)pvItem; free(psToken->pcValue); free(psToken); } /*--------------------------------------------------------------------*/ static void printNumberToken(void *pvItem, void *pvExtra) /* Print token pvItem to stdout iff it is a number. pvExtra is unused. */ { struct Token *psToken = (struct Token*)pvItem; if (psToken->eType == TOKEN_NUMBER) printf("%s ", psToken->pcValue); } /*--------------------------------------------------------------------*/ static void printWordToken(void *pvItem, void *pvExtra) /* Print token pvItem to stdout iff it is a word. pvExtra is unused. */ { struct Token *psToken = (struct Token*)pvItem; if (psToken->eType == TOKEN_WORD) printf("%s ", psToken->pcValue); } /*--------------------------------------------------------------------*/ static struct Token *makeToken(enum TokenType eTokenType, char *pcValue) /* Create and return a Token whose type is eTokenType and whose value consists of string pcValue. The caller owns the Token. */ { struct Token *psToken; psToken = (struct Token*)malloc(sizeof(struct Token)); assert(psToken != NULL); psToken->eType = eTokenType; psToken->pcValue = (char*)malloc(strlen(pcValue) + 1); assert(psToken->pcValue != NULL); strcpy(psToken->pcValue, pcValue); return psToken; } /*--------------------------------------------------------------------*/ static int lexLine(const char *pcLine, DynArray_T oTokens) /* Lexically analyze string pcLine. Populate oTokens with the tokens that pcLine contains. Return TRUE if successful, and FALSE if pcLine contains a lexical error. In the latter case, oTokens may contain tokens that were discovered before the lexical error. The caller owns the tokens placed in oTokens. */ /* lexLine() uses a DFA approach. It "reads" its characters from pcLine.*/ { enum LexState eState = STATE_START; int iLineIndex = 0; int iValueIndex = 0; char c; char acValue[MAX_LINE_SIZE]; struct Token *psToken; while ((eState != STATE_EXIT) && (eState != STATE_ERROR)) { /* "Read" the next character from pcLine. */ c = pcLine[iLineIndex++]; switch (eState) { case STATE_START: if ((c == '\n') || (c == '\0')) eState = STATE_EXIT; else if (isdigit(c)) { acValue[iValueIndex++] = c; eState = STATE_IN_NUMBER; } else if (isalpha(c)) { acValue[iValueIndex++] = c; eState = STATE_IN_WORD; } else if ((c == ' ') || (c == '\t')) eState = STATE_START; else eState = STATE_ERROR; break; case STATE_IN_NUMBER: if ((c == '\n') || (c == '\0')) { /* Create a NUMBER token. */ acValue[iValueIndex] = '\0'; psToken = makeToken(TOKEN_NUMBER, acValue); DynArray_add(oTokens, psToken); iValueIndex = 0; eState = STATE_EXIT; } else if (isdigit(c)) { acValue[iValueIndex++] = c; eState = STATE_IN_NUMBER; } else if ((c == ' ') || (c == '\t')) { /* Create a NUMBER token. */ acValue[iValueIndex] = '\0'; psToken = makeToken(TOKEN_NUMBER, acValue); DynArray_add(oTokens, psToken); iValueIndex = 0; eState = STATE_START; } else eState = STATE_ERROR; break; case STATE_IN_WORD: if ((c == '\n') || (c == '\0')) { /* Create a WORD token. */ acValue[iValueIndex] = '\0'; psToken = makeToken(TOKEN_WORD, acValue); DynArray_add(oTokens, psToken); iValueIndex = 0; eState = STATE_EXIT; } else if (isalpha(c)) { acValue[iValueIndex++] = c; eState = STATE_IN_WORD; } else if ((c == ' ') || (c == '\t')) { /* Create a WORD token. */ acValue[iValueIndex] = '\0'; psToken = makeToken(TOKEN_WORD, acValue); DynArray_add(oTokens, psToken); iValueIndex = 0; eState = STATE_START; } else eState = STATE_ERROR; break; default: assert(0); } } if (eState == STATE_ERROR) return FALSE; return TRUE; } /*--------------------------------------------------------------------*/ int main(void) /* Read a line from stdin, and write to stdout each number and word that it contains. Repeat until EOF. Return 0. */ { char acLine[MAX_LINE_SIZE]; DynArray_T oTokens; int iSuccessful; printf("------------------------------------\n"); while (fgets(acLine, MAX_LINE_SIZE, stdin) != NULL) { oTokens = DynArray_new(0); iSuccessful = lexLine(acLine, oTokens); if (iSuccessful) { printf("Numbers: "); DynArray_map(oTokens, printNumberToken, NULL); printf("\n"); printf("Words: "); DynArray_map(oTokens, printWordToken, NULL); printf("\n"); } else printf("Invalid line\n"); printf("------------------------------------\n"); DynArray_map(oTokens, freeToken, NULL); DynArray_free(oTokens); } return 0; }