3111skyline.com - C Word Search File

Simple Word Searching within a File

Matching Whole Words

You can easily search for words or phrases within a file using the functions provided in <string.h>. Depending on how you want to control the matching (e.g. whole words or any included substring match), you can either tokenize each line into individual words with strtok or simply use pointer-oriented functions such as strstr or strchr and pointer arithmetic to work from the beginning to the end of the line. The first example below uses strtok to tokenize each line into separate words and compares each whole word against each search term with strcmp, for a match limited to whole words.

Matching Terms as Substrings within Words

To perform a partial match within any word (e.g. "the" in "there"), you can iterate over each character in each word, calling strncmp with the length of each search term to limit the comparison within each word. That will provide partial matches anywhere within each word.

Yet a third approach is to iterate over each line using strstr to return a pointer to the beginning of any search term found within the line, process that match, set the pointer to the first character past the matched term, and repeat until the end of the line is reached. This method simplifies term location within the file, but requires additional code after the call if you want to limit your match to whole words.

Summary of Code Operation

validate and convert each argument to lower-case (search terms).
read each line of the input file with fgets.
check complete line is read by checking for '\n', set nl flag.
tokenize each line using strtok.
convert each token to lower-case and compare against terms with strcmp.
increment the word.occ entry for the current line to record the number of times the match occurs on that line, and increment word.freq to update the total frequency for the word in the file.
if full-line read (nl set), increment line count, repeat.
output the results for each search term.

Sample Code - Match Whole Words

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* fixed limits: number of terms, term size, line count and line size */
enum {
    NTRM = 2,       /* number of search terms taken from the command line */
    MAXT = 29,      /* size in bytes of the buffer holding one search term */
    MAXL = 128,     /* maximum number of input lines that are tracked */
    MAXC = 512      /* size in bytes of the line-read buffer */
};

typedef struct {        /* per-term search record */
    char term[MAXT];    /* search term text (lower-cased by main) */
    char occ[MAXL];     /* occurrences per line, indexed by 0-based line number */
    size_t freq;        /* total occurrences over the whole input */
} sterm;

/* lower-case s in place and return it (defined below) */
char *str2lower (char *s);
/* fopen wrapper that exits the program on failure (defined below) */
FILE *xfopen (const char *fn, const char *mode);

/** Count whole-word, case-insensitive occurrences of NTRM search terms.
 *
 *  usage: prog term term [file (stdin)]
 *
 *  Each input line is split into tokens on the characters in delim; every
 *  token is lower-cased and compared with strcmp against the lower-cased
 *  search terms.  At most MAXL lines are processed.  For each term the
 *  total count and the per-line counts (1-based line numbers) are printed.
 *
 *  returns 0 on success, 1 when fewer than NTRM terms are given.
 */
int main (int argc, char **argv) {

    if (argc < NTRM + 1 ) { /* validate NTRM search terms given */
        fprintf (stderr, "error: insufficient input.\n"
                         "usage: %s term term [file (stdin)]\n",
                         argv[0]);
        return 1;
    }

    sterm word[NTRM] = {{ .term = "" }};     /* zero-initialize all terms */
    size_t i, j, n = 0;
    const char *delim = " ,.;\t\n";
    char buf[MAXC] = "";
    int partial = 0;    /* set while the current line has not seen its '\n' */
    /* the optional filename follows the NTRM terms, otherwise read stdin */
    FILE *fp = argc > NTRM + 1 ? xfopen (argv[NTRM + 1], "r") : stdin;

    for (i = 0; i < NTRM; i++) {     /* fill word w/search terms */
        /* snprintf always nul-terminates; an over-long term is truncated
         * to MAXT - 1 characters instead of leaving term unterminated */
        snprintf (word[i].term, sizeof word[i].term, "%s", argv[i+1]);
        str2lower (word[i].term);            /* convert to lower */
    }

    while (n < MAXL && fgets (buf, MAXC, fp)) { /* for each line */
        char *p;

        partial = !strchr (buf, '\n');       /* check short-read */

        /* tokenize (after the newline check, strtok overwrites delimiters) */
        for (p = strtok (buf, delim); p; p = strtok (NULL, delim)) {
            str2lower (p);           /* lower-case once per token */
            for (i = 0; i < NTRM; i++) {
                if (!strcmp (word[i].term, p)) {
                    word[i].occ[n]++;            /* add occurrence */
                    word[i].freq++;
                }
            }
        }

        if (!partial)
            n++;                        /* increment line count  */
        else if (!feof (fp))            /* line longer than buf  */
            fprintf (stderr, "warning: short-read line: %zu\n", n + 1);
    }
    /* a final line without a trailing newline was tallied in occ[n] above;
     * count it so its matches show up in the per-line report */
    if (partial)
        n++;

    if (fp != stdin) fclose (fp);     /* close file if not stdin */

    for (i = 0; i < NTRM; i++) {      /* print results */
        printf ("\n word '%s' appears %zu times\n",
                word[i].term, word[i].freq);
        for (j = 0; j < n; j++)
            if (word[i].occ[j])
                printf ("   %2d times in line %zu\n",
                        word[i].occ[j], j + 1);
    }
    putchar ('\n');
    return 0;
}

/** Lower-case an ASCII string in place.
 *
 *  Every byte in the range 'A'..'Z' is replaced by its lowercase
 *  counterpart; all other bytes are left untouched.
 *
 *  s        nul-terminated string to modify (may be NULL).
 *  returns  s itself, or NULL when s is NULL.
 */
char *str2lower (char *s)
{
    if (s == NULL)
        return NULL;

    for (size_t k = 0; s[k] != '\0'; k++) {
        /* 0x20 is the bit that separates lowercase from uppercase in ASCII */
        if (s[k] >= 'A' && s[k] <= 'Z')
            s[k] = (char) (s[k] | 0x20);
    }

    return s;
}

/** fopen with error checking.
 *
 *  fn       path of the file to open.
 *  mode     fopen mode string (e.g. "r").
 *  returns  the open stream; never returns NULL.
 *
 *  On a NULL argument or a failed fopen a diagnostic is written to
 *  stderr and the program exits with EXIT_FAILURE.
 */
FILE *xfopen (const char *fn, const char *mode)
{
    if (!fn || !mode) {     /* report bad arguments rather than exit silently */
        fprintf (stderr, "xfopen() error: NULL filename or mode.\n");
        exit (EXIT_FAILURE);
    }

    FILE *fp = fopen (fn, mode);

    if (!fp) {
        fprintf (stderr, "xfopen() error: file open failed '%s'.\n", fn);
        exit (EXIT_FAILURE);
    }

    return fp;
}

Compile:

gcc -Wall -Wextra -finline-functions -Ofast -o srch_words srch_words.c

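With relatively simple changes to the comparison, e.g. using strncmp, you can match the search term as a substring at the start of the current token, using the length of the search term to limit the comparison (repeat the comparison from each character of the token to match anywhere within it), e.g.: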

            if (!strncmp (word[i].term, str2lower(p), word[i].tlen))
                word[i].occ[n]++, word[i].freq++; /* add occ */

Example Input:

    $ nl -b a dat/damages.txt
     1  Personal injury damage awards are unliquidated
     2  and are not capable of certain measurement; thus, the
     3  jury has broad discretion in assessing the amount of
     4  damages in a personal injury case. Yet, at the same
     5  time, a factual sufficiency review insures that the
     6  evidence supports the jury's award; and, although
     7  difficult, the law requires appellate courts to conduct
     8  factual sufficiency reviews on damage awards in
     9  personal injury cases. Thus, while a jury has latitude in
    10  assessing intangible damages in personal injury cases,
    11  a jury's damage award does not escape the scrutiny of
    12  appellate review.
    13
    14  Because Texas law applies no physical manifestation
    15  rule to restrict wrongful death recoveries, a
    16  trial court in a death case is prudent when it chooses
    17  to submit the issues of mental anguish and loss of
    18  society and companionship. While there is a
    19  presumption of mental anguish for the wrongful death
    20  beneficiary, the Texas Supreme Court has not indicated
    21  that reviewing courts should presume that the mental
    22  anguish is sufficient to support a large award. Testimony
    23  that proves the beneficiary suffered severe mental
    24  anguish or severe grief should be a significant and
    25  sometimes determining factor in a factual sufficiency
    26  analysis of large non-pecuniary damage awards.

Use/Output:

Searching for awards and texas through the passage above yields the following:

    $ ./bin/srch_words awards texas <dat/damages.txt

     word 'awards' appears 3 times
        1 times in line 1
        1 times in line 8
        1 times in line 26

     word 'texas' appears 2 times
        1 times in line 14
        1 times in line 20

Changing the search terms to of and the, to capture a couple of the more frequent words, provides the following analysis:

    $ ./bin/srch_words of the <dat/damages.txt

     word 'of' appears 7 times
        1 times in line 2
        1 times in line 3
        1 times in line 11
        2 times in line 17
        1 times in line 19
        1 times in line 26

     word 'the' appears 12 times
        1 times in line 2
        1 times in line 3
        1 times in line 4
        1 times in line 5
        1 times in line 6
        1 times in line 7
        1 times in line 11
        1 times in line 17
        1 times in line 19
        1 times in line 20
        1 times in line 21
        1 times in line 23

Matching Partial Words

The changes required for partial matches using strtok to tokenize each line are shown above. Below is the code for finding partial matches using strstr and pointer arithmetic rather than tokenizing each line. Recall that strstr searches for the first occurrence of a substring (term) within a string, returning a pointer to the first character of the substring if found, NULL otherwise. By repeatedly calling strstr while the return is not NULL, processing the match, and updating the line-pointer to point to the next character after the current substring before calling strstr again, you can walk the pointer down the line finding each occurrence of a given term.

Summary of Code Operation

validate and convert each argument to lower-case (search terms).
read each line of the input file with fgets.
check complete line is read by checking for '\n', set nl flag.
convert line to lower-case
for each term, search within each line for the term using strstr.
if match, update the start-pointer to point past the current term in the line.
increment the word.occ entry for the current line to record the number of times the match occurs on that line, and increment word.freq to update the total frequency for the word in the file.
if full-line read (nl set), increment line count, repeat.
output the results for each search term.

Sample Code - Match Partial Words

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* fixed limits: number of terms, term size, line count and line size */
enum {
    NTRM = 2,       /* number of search terms taken from the command line */
    MAXT = 29,      /* size in bytes of the buffer holding one search term */
    MAXL = 128,     /* maximum number of input lines that are tracked */
    MAXC = 512      /* size in bytes of the line-read buffer */
};

typedef struct {        /* per-term search record */
    char term[MAXT];    /* search term text (lower-cased by main) */
    char occ[MAXL];     /* occurrences per line, indexed by 0-based line number */
    size_t tlen;        /* length of term, used to step past each match */
    size_t freq;        /* total occurrences over the whole input */
} sterm;

/* lower-case s in place and return it (defined below) */
char *str2lower (char *s);
/* fopen wrapper that exits the program on failure (defined below) */
FILE *xfopen (const char *fn, const char *mode);

/** Count case-insensitive substring occurrences of NTRM search terms.
 *
 *  usage: prog term term [file (stdin)]
 *
 *  Each input line is lower-cased and scanned with strstr for every
 *  lower-cased search term; after a match the scan resumes just past the
 *  matched text, so overlapping matches are not counted.  At most MAXL
 *  lines are processed.  For each term the total count and the per-line
 *  counts (1-based line numbers) are printed.
 *
 *  returns 0 on success, 1 when fewer than NTRM terms are given.
 */
int main (int argc, char **argv) {

    if (argc < NTRM + 1 ) {  /* validate NTRM search terms given */
        fprintf (stderr, "error: insufficient input.\n"
                         "usage: %s term term [file (stdin)]\n",
                         argv[0]);
        return 1;
    }

    sterm word[NTRM] = {{ .term = "" }};      /* zero-initialize all terms */
    size_t i, j, n = 0;
    char buf[MAXC] = "";
    int partial = 0;    /* set while the current line has not seen its '\n' */
    /* the optional filename follows the NTRM terms, otherwise read stdin */
    FILE *fp = argc > NTRM + 1 ? xfopen (argv[NTRM + 1], "r") : stdin;

    for (i = 0; i < NTRM; i++) {
        /* snprintf always nul-terminates; an over-long term is truncated
         * to MAXT - 1 characters instead of leaving term unterminated */
        snprintf (word[i].term, sizeof word[i].term, "%s", argv[i+1]);
        word[i].tlen = strlen (word[i].term);      /* get length */
        str2lower (word[i].term);            /* convert to lower */
    }

    while (n < MAXL && fgets (buf, MAXC, fp))  /* read each line */
    {
        char *p = str2lower (buf);           /* convert to lower */

        partial = !strchr (buf, '\n');       /* check short-read */

        for (i = 0; i < NTRM; i++)  {    /* for each search term */
            char *ep, *sp;              /* check for term in buf */

            /* strstr finds "" at every position without advancing, so an
             * empty term would loop forever; report it as 0 occurrences */
            if (!word[i].tlen)
                continue;

            for (sp = p; (ep = strstr (sp, word[i].term));
                 sp = ep + word[i].tlen) {
                word[i].occ[n]++;                 /* update freq */
                word[i].freq++;
            }
        }

        if (!partial)
            n++;  /* increment line count */
        else if (!feof (fp))    /* line longer than buf */
            fprintf (stderr, "warning: short-read line: %zu\n", n + 1);
    }
    /* a final line without a trailing newline was tallied in occ[n] above;
     * count it so its matches show up in the per-line report */
    if (partial)
        n++;

    if (fp != stdin) fclose (fp);     /* close file if not stdin */

    for (i = 0; i < NTRM; i++) {      /* print results */
        printf ("\n word '%s' appears %zu times\n",
                word[i].term, word[i].freq);
        for (j = 0; j < n; j++)
            if (word[i].occ[j])
                printf ("   %2d times in line %zu\n",
                        word[i].occ[j], j + 1);
    }
    putchar ('\n');

    return 0;
}

/** Lower-case an ASCII string in place.
 *
 *  Every byte in the range 'A'..'Z' is replaced by its lowercase
 *  counterpart; all other bytes are left untouched.
 *
 *  s        nul-terminated string to modify (may be NULL).
 *  returns  s itself, or NULL when s is NULL.
 */
char *str2lower (char *s)
{
    if (s == NULL)
        return NULL;

    for (size_t k = 0; s[k] != '\0'; k++) {
        /* 0x20 is the bit that separates lowercase from uppercase in ASCII */
        if (s[k] >= 'A' && s[k] <= 'Z')
            s[k] = (char) (s[k] | 0x20);
    }

    return s;
}

/** fopen with error checking.
 *
 *  fn       path of the file to open.
 *  mode     fopen mode string (e.g. "r").
 *  returns  the open stream; never returns NULL.
 *
 *  On a NULL argument or a failed fopen a diagnostic is written to
 *  stderr and the program exits with EXIT_FAILURE.
 */
FILE *xfopen (const char *fn, const char *mode)
{
    if (!fn || !mode) {     /* report bad arguments rather than exit silently */
        fprintf (stderr, "xfopen() error: NULL filename or mode.\n");
        exit (EXIT_FAILURE);
    }

    FILE *fp = fopen (fn, mode);

    if (!fp) {
        fprintf (stderr, "xfopen() error: file open failed '%s'.\n", fn);
        exit (EXIT_FAILURE);
    }

    return fp;
}

Use/Output:

    $ ./bin/srch_words_part of the <dat/damages.txt

     word 'of' appears 7 times
        1 times in line 2
        1 times in line 3
        1 times in line 11
        2 times in line 17
        1 times in line 19
        1 times in line 26

     word 'the' appears 13 times
        1 times in line 2
        1 times in line 3
        1 times in line 4
        1 times in line 5
        1 times in line 6
        1 times in line 7
        1 times in line 11
        1 times in line 17
        1 times in line 18
        1 times in line 19
        1 times in line 20
        1 times in line 21
        1 times in line 23

C References

Tutorials

Libraries

Code Tools

Comments/Bugs

Simple Word Searching within a File

Matching Whole Words

Matching Terms as Substrings within Words

Summary of Code Operation

Sample Code - Match Whole Words

Compile:

Example Input:

Use/Output:

Matching Partial Words

Summary of Code Operation

Sample Code - Match Partial Words

Use/Output:

As usual, this page is under construction.