#include #include #include #include #include "halium.h" #include #include #define error(x)\ fprintf(stderr, "File %s, line %i: %s", __FILE__, __LINE__, x); #define errorf(f, x)\ fprintf(stderr, "File %s, line %i: " # f, __FILE__, __LINE__, x); /* --- Prototypes ---------------------------------------------------------- */ static inline int is_word_character(char); char* lowercase(char*); char** make_word_list(const char*); void free_word_list(char**); static int word_list_length(char**); static int node_sort(const node_t*, const node_t*); int word_find(const char*, const node_t*); model_t* make_model(int); void free_model(model_t*); static node_t* make_node(void); static void free_node(node_t*); static void initialize_context(model_t*); void learn(model_t*, char**); static void update_model(model_t*, char*); /* --- Definitions --------------------------------------------------------- */ /* True if c is considered part of word */ int is_word_character(char c) { return strchr(" \n\t\",.;:?!&()[]{}+=/", c) == NULL; } /* Convert a string to lowercase */ char* lowercase(char* s) { char* p; for (p = s; *p != '\0'; p++) *p = tolower(*p); return s; } /* Break a string into a list of words and non-words */ char** make_word_list(const char* s) { char** list; /* List to return */ char** list_p; /* Next element */ int list_len; /* Maximum entries in list */ const char* b = s; /* Start of word or non-word */ int word_flag; /* Now forming a word? */ int i; /* Generic iterator */ if (*s == '\0') { list = malloc(sizeof(char*)); if (list == NULL) { error("Unable to allocate word list\n"); return NULL; } list[0] = NULL; return list; } word_flag = is_word_character(*s); /* Guess one word and one non-word per six characters */ list_len = strlen(s) / 3; if (list_len < 2) list_len = 2; list_p = list = malloc(sizeof(char*) * list_len); if (list == NULL) { error("Unable to allocate word list\n"); return NULL; } for (i = 0; i < list_len; i++) list[i] = NULL; while (*s != '\0') { /* Check for boundary */ s++; if (*s != '\0' && is_word_character(*s) == word_flag) continue; /* Add to word list */ *list_p = malloc(s - b + 1); if (*list_p == NULL) { free_word_list(list); error("Unable to allocate new word for list\n"); return NULL; } memcpy(*list_p, b, s - b); (*list_p)[s - b] = '\0'; /* Advance to next slot in list */ list_p++; if (list_p == &list[list_len]) { /* Allocate space for more entries */ int len2 = list_len + 6; char** list2 = realloc(list, sizeof(char*) * len2); if (list2 == NULL) { free_word_list(list); error("Unable to expand word list\n"); return NULL; } list = list2; list_p = list + list_len; for (i = list_len; i < len2; i++) list[i] = NULL; list_len = len2; } /* Prepare to find next word or non-word */ word_flag = !word_flag; b = s; } return list; } /* Deallocate a word list and all words in it */ void free_word_list(char** l) { char** l_p; if (l == NULL) return; for (l_p = l; *l_p != NULL; l_p++) free(*l_p); free(l); } /* Return the number of entries in word list l */ int word_list_length(char** l) { int i = 0; for (; *l != NULL; l++, i++); return i; } /* Sort nodes alphabetically by word */ int node_sort(const node_t* a, const node_t* b) { return strcmp(a->word, b->word); } /* Find a node with a given word */ int word_find(const char* a, const node_t* b) { return strcmp(a, b->word); } /* Allocate and return an empty model */ model_t* make_model(int order) { int i; model_t* m = malloc(sizeof(model_t)); m->order = order; m->forward = newtree234((cmpfn234)node_sort); m->backward = newtree234((cmpfn234)node_sort); m->context = malloc(sizeof(tree234*) * (order + 2)); m->dictionary = newtree234((cmpfn234)strcmp); initialize_context(m); return m; } /* Free a model and all its related data structures */ void free_model(model_t* m) { freetree234(m->forward); freetree234(m->backward); free(m->context); freetree234(m->dictionary); } /* Allocate and return an empty node */ node_t* make_node(void) { node_t* n = malloc(sizeof(node_t)); if (n == NULL) { error("Unable to allocate node\n"); return NULL; } n->tree = newtree234((cmpfn234)node_sort); if (n->tree == NULL) { free(n); error("Unable to allocate tree for node\n"); return NULL; } n->word = NULL; n->usage = 0; return n; } /* Free a node and its related data structures */ void free_node(node_t* n) { int i; node_t* p; /* Recursively free subtrees */ if (n->tree != NULL) { for (i = 0; (p = index234(n->tree, i)) != NULL; i++) free_node(p); freetree234(n->tree); } /* Free this node */ free(n); } /* Reset a model's context */ void initialize_context(model_t* m) { int i; for (i = 0; i < m->order + 1; i++) m->context[i] = NULL; } /* Add word list l to model m */ void learn(model_t* m, char** l) { int i; char* entry; int length = word_list_length(l); /* Ignore short input */ if (length <= m->order) return; /* Train model in forward direction */ initialize_context(m); m->context[0] = m->forward; for (i = 0; i < length; i++) { entry = find234(m->dictionary, l[i], NULL); if (entry == NULL) { /* Add to dictionary */ char* new_entry = strdup(l[i]); if (new_entry == NULL) { error("Unable to allocate new word\n"); return; } entry = add234(m->dictionary, new_entry); if (entry == NULL) { free(new_entry); error("Unable to add word to dictionary\n"); return; } } update_model(m, entry); } /* Train model in backward direction */ initialize_context(m); m->context[0] = m->backward; for (i = length - 1; i >= 0; i--) { entry = find234(m->dictionary, l[i], NULL); update_model(m, entry); } } /* Update model m with word w */ void update_model(model_t* m, char* w) { int i; for (i = m->order + 1; i > 0; i--) { node_t* node; if (m->context[i - 1] == NULL) continue; /* Find w in the list */ node = find234(m->context[i - 1], w, (cmpfn234)word_find); if (node == NULL) { /* Add w to the list */ node = make_node(); if (node == NULL) { error("Unable to add node to model\n"); return; } node->word = w; node->usage = 1; add234(m->context[i - 1], node); } else if (node->usage < INT_MAX) { node->usage++; } /* Add to the next context */ m->context[i] = node->tree; } }