/* * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada * And Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include <errno.h> #include "mythes.hxx" // some basic utility routines // string duplication routine char * mythes_mystrdup(const char * p) { int sl = strlen(p) + 1; char * d = (char *)malloc(sl); if (d) { memcpy(d,p,sl); return d; } return NULL; } // remove cross-platform text line end characters void mythes_mychomp(char * s) { int k = strlen(s); if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; } // return index of char in string int mystr_indexOfChar(const char * d, int c) { const char * p = strchr(d,c); if (p) return (int)(p-d); return -1; } MyThes::MyThes(const char* idxpath, const char * datpath) { nw = 0; encoding = NULL; list = NULL; offst = NULL; if (thInitialize(idxpath, datpath) != 1) { fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath); fflush(stderr); if (encoding) free((void*)encoding); if (list) free((void*)list); if (offst) free((void*)offst); // did not initialize properly - throw exception? } } MyThes::~MyThes() { if (thCleanup() != 1) { /* did not cleanup properly - throw exception? */ } if (encoding) free((void*)encoding); encoding = NULL; list = NULL; offst = NULL; } int MyThes::thInitialize(const char* idxpath, const char* datpath) { // open the index file FILE * pifile = fopen(idxpath,"r"); if (!pifile) { pifile = NULL; return 0; } // parse in encoding and index size */ char * wrd; wrd = (char *)calloc(1, MAX_WD_LEN); int len = readLine(pifile,wrd,MAX_WD_LEN); encoding = mythes_mystrdup(wrd); len = readLine(pifile,wrd,MAX_WD_LEN); int idxsz = atoi(wrd); // now allocate list, offst for the given size list = (char**) calloc(idxsz,sizeof(char*)); offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int)); if ( (!(list)) || (!(offst)) ) { fprintf(stderr,"Error - bad memory allocation\n"); fflush(stderr); return 0; } // now parse the remaining lines of the index len = readLine(pifile,wrd,MAX_WD_LEN); while (len > 0) { int np = mystr_indexOfChar(wrd,'|'); if (nw < idxsz) { if (np >= 0) { *(wrd+np) = '\0'; list[nw] = (char *)calloc(1,(np+1)); memcpy((list[nw]),wrd,np); offst[nw] = atoi(wrd+np+1); nw++; } } len = readLine(pifile,wrd,MAX_WD_LEN); } free((void *)wrd); fclose(pifile); pifile=NULL; /* next open the data file */ pdfile = fopen(datpath,"r"); if (!pdfile) { pdfile = NULL; return 0; } return 1; } int MyThes::thCleanup() { /* first close the data file */ if (pdfile) { fclose(pdfile); pdfile=NULL; } /* now free up all the allocated strings on the list */ for (int i=0; i < nw; i++) { if (list[i]) { free(list[i]); list[i] = 0; } } if (list) free((void*)list); if (offst) free((void*)offst); nw = 0; return 1; } // lookup text in index and count of meanings and a list of meaning entries // with each entry having a synonym count and pointer to an // array of char * (i.e the synonyms) // // note: calling routine should call CleanUpAfterLookup with the original // meaning point and count to properly deallocate memory int MyThes::Lookup(const char * pText, int len, mentry** pme) { *pme = NULL; // handle the case of missing file or file related errors if (! pdfile) return 0; long offset = 0; /* copy search word and make sure null terminated */ char * wrd = (char *) calloc(1,(len+1)); memcpy(wrd,pText,len); /* find it in the list */ int idx = binsearch(wrd,list,nw); free(wrd); if (idx < 0) return 0; // now seek to the offset offset = (long) offst[idx]; int rc = fseek(pdfile,offset,SEEK_SET); if (rc) { return 0; } // grab the count of the number of meanings // and allocate a list of meaning entries char * buf = NULL; buf = (char *) malloc( MAX_LN_LEN ); if (!buf) return 0; readLine(pdfile, buf, (MAX_LN_LEN-1)); int np = mystr_indexOfChar(buf,'|'); if (np < 0) { free(buf); return 0; } int nmeanings = atoi(buf+np+1); *pme = (mentry*) malloc( nmeanings * sizeof(mentry) ); if (!(*pme)) { free(buf); return 0; } // now read in each meaning and parse it to get defn, count and synonym lists mentry* pm = *(pme); char dfn[MAX_WD_LEN]; for (int j = 0; j < nmeanings; j++) { readLine(pdfile, buf, (MAX_LN_LEN-1)); pm->count = 0; pm->psyns = NULL; pm->defn = NULL; // store away the part of speech for later use char * p = buf; char * pos = NULL; np = mystr_indexOfChar(p,'|'); if (np >= 0) { *(buf+np) = '\0'; pos = mythes_mystrdup(p); p = p + np + 1; } else { pos = mythes_mystrdup(""); } // count the number of fields in the remaining line int nf = 1; char * d = p; np = mystr_indexOfChar(d,'|'); while ( np >= 0 ) { nf++; d = d + np + 1; np = mystr_indexOfChar(d,'|'); } pm->count = nf; pm->psyns = (char **) malloc(nf*sizeof(char*)); // fill in the synonym list d = p; for (int j = 0; j < nf; j++) { np = mystr_indexOfChar(d,'|'); if (np > 0) { *(d+np) = '\0'; pm->psyns[j] = mythes_mystrdup(d); d = d + np + 1; } else { pm->psyns[j] = mythes_mystrdup(d); } } // add pos to first synonym to create the definition int k = strlen(pos); int m = strlen(pm->psyns[0]); if ((k+m) < (MAX_WD_LEN - 1)) { strncpy(dfn,pos,k); *(dfn+k) = ' '; strncpy((dfn+k+1),(pm->psyns[0]),m+1); pm->defn = mythes_mystrdup(dfn); } else { pm->defn = mythes_mystrdup(pm->psyns[0]); } free(pos); pm++; } free(buf); return nmeanings; } void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings) { if (nmeanings == 0) return; if ((*pme) == NULL) return; mentry * pm = *pme; for (int i = 0; i < nmeanings; i++) { int count = pm->count; for (int j = 0; j < count; j++) { if (pm->psyns[j]) free(pm->psyns[j]); pm->psyns[j] = NULL; } if (pm->psyns) free(pm->psyns); pm->psyns = NULL; if (pm->defn) free(pm->defn); pm->defn = NULL; pm->count = 0; pm++; } pm = *pme; free(pm); *pme = NULL; return; } // read a line of text from a text file stripping // off the line terminator and replacing it with // a null string terminator. // returns: -1 on error or the number of characters in // in the returning string // A maximum of nc characters will be returned int MyThes::readLine(FILE * pf, char * buf, int nc) { if (fgets(buf,nc,pf)) { mythes_mychomp(buf); return strlen(buf); } return -1; } // performs a binary search on null terminated character // strings // // returns: -1 on not found // index of wrd in the list[] int MyThes::binsearch(char * sw, char* list[], int nlst) { int lp, up, mp, j, indx; lp = 0; up = nlst-1; indx = -1; if (nlst == 0) return -1; if (strcmp(sw,list[lp]) < 0) return -1; if (strcmp(sw,list[up]) > 0) return -1; while (indx < 0 ) { mp = (int)((lp+up) >> 1); j = strcmp(sw,list[mp]); if ( j > 0) { lp = mp + 1; } else if (j < 0 ) { up = mp - 1; } else { indx = mp; } if (lp > up) return -1; } return indx; } char * MyThes::get_th_encoding() { if (encoding) return encoding; return NULL; }