Simple Word Searching within a File
Matching Whole Words
You can easily search for words or phrases within a file using the functions provided in <string.h>
. Depending on how you want to control the matching, (e.g. whole words or any included substring match) you can either tokenize each line into individual words with strtok
or simply use pointer-returning search functions such as strstr
or strchr
and pointer arithmetic to work from the beginning to end of the line. The first example below uses strtok
to tokenize each line into the separate words and compares each whole word against each search term with strcmp
for a match limited to whole words.
Matching Terms as Substrings within Words
To perform a partial match within any word (e.g. "the" in "there"), you can iterate over each character in each word calling strncmp
using the length of each search term to limit the comparison within each word. That will provide partial matches anywhere within each word.
Yet a third approach is to iterate over each line using strstr
to return a pointer to the beginning of any search term found within the line, process that match, set the pointer to the first character past the matched term and repeat until the end of line is reached. This method simplifies term location within the file, but adds additional code after the call if you want to limit your match to whole words.
Summary of Code Operation
- validate and convert each argument to lower-case (search terms).
- read each line of the input file with
fgets
. - check complete line is read by checking for
'\n'
, set nl
flag. - tokenize each line using
strtok
. - convert each token to lower-case and compare against terms with
strcmp
. - increment
word.occ
array, recording the line and the number of times the match occurs, and increment word.freq
to update the total frequency for the word in the file. - if full-line read (
nl
set), increment line count, repeat. - output the results for each search term.
Sample Code - Match Whole Words
/*
 * srch_words.c - count whole-word, case-insensitive (ASCII) occurrences of
 * NTRM search terms in a text file (or stdin), per line and in total.
 *
 * usage: srch_words term term [file (stdin)]
 *
 * Limits: only the first MAXL lines are examined.  Lines are read in chunks
 * of at most MAXC-1 characters and tokenized per chunk, so a word that
 * straddles a chunk boundary of an over-long line is seen as two tokens.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* constants for num & max terms, lines and chars */
enum { NTRM = 2, MAXT = 29, MAXL = 128, MAXC = 512 };

typedef struct {            /* search term struct */
    char term[MAXT];        /* search term, lower-case, NUL-terminated */
    unsigned occ[MAXL];     /* occurrences per line (index = line - 1) */
    size_t freq;            /* total occurrences */
} sterm;

char *str2lower (char *s);
FILE *xfopen (const char *fn, const char *mode);

int main (int argc, char **argv)
{
    if (argc < NTRM + 1) {  /* validate NTRM search terms given */
        fprintf (stderr, "error: insufficient input.\n"
                         "usage: %s term term [file (stdin)]\n", argv[0]);
        return 1;
    }

    sterm word[NTRM] = {{ .term = "" }};    /* all members zeroed */
    size_t i, j, n = 0;                     /* n = completed line count */
    int partial = 0;        /* last chunk read had no trailing '\n' */
    const char *delim = " ,.;\t\n";
    char buf[MAXC] = "";

    for (i = 0; i < NTRM; i++) {            /* fill word w/search terms */
        size_t len = strlen (argv[i + 1]);

        /* refuse to truncate: a clipped term would match the wrong words
         * (and an unterminated one would be read out of bounds) */
        if (len >= MAXT) {
            fprintf (stderr, "error: search term '%s' exceeds %d characters.\n",
                     argv[i + 1], MAXT - 1);
            return 1;
        }
        memcpy (word[i].term, argv[i + 1], len + 1);    /* copy incl. NUL */
        str2lower (word[i].term);                       /* convert to lower */
    }

    /* optional file name follows the NTRM terms */
    FILE *fp = argc > NTRM + 1 ? xfopen (argv[NTRM + 1], "r") : stdin;

    while (n < MAXL && fgets (buf, MAXC, fp)) {         /* for each line */
        int nl = strchr (buf, '\n') != NULL;            /* complete line? */
        char *tok;

        /* tokenize chunk and compare each lower-cased token to the terms */
        for (tok = strtok (buf, delim); tok; tok = strtok (NULL, delim)) {
            str2lower (tok);                /* once per token, not per term */
            for (i = 0; i < NTRM; i++)
                if (strcmp (word[i].term, tok) == 0) {
                    word[i].occ[n]++;       /* add occurrence for line */
                    word[i].freq++;
                }
        }

        if (nl) {
            n++;                            /* increment line count */
            partial = 0;
        }
        else {
            partial = 1;
            /* a last line lacking '\n' at end-of-file is not a short read */
            if (!feof (fp))
                fprintf (stderr, "warning: short-read line: %zu\n", n + 1);
        }
    }
    if (partial)    /* count the final unterminated line so its matches */
        n++;        /* show up in the per-line report (n stays <= MAXL) */

    if (fp != stdin) fclose (fp);           /* close file if not stdin */

    for (i = 0; i < NTRM; i++) {            /* print results */
        printf ("\n word '%s' appears %zu times\n",
                word[i].term, word[i].freq);
        for (j = 0; j < n; j++)
            if (word[i].occ[j])
                printf (" %2u times in line %zu\n", word[i].occ[j], j + 1);
    }
    putchar ('\n');

    return 0;
}

/** convert string to lowercase (ASCII letters only), in place.
 *  returns s, or NULL if s is NULL.
 *  0x20 (1 << 5, the sixth bit) is the case bit in 7-bit ASCII
 *  (1 = lowercase).
 */
char *str2lower (char *s)
{
    if (!s) return NULL;

    for (char *p = s; *p; p++)
        if ('A' <= *p && *p <= 'Z')     /* if uppercase */
            *p |= (1 << 5);             /* set case bit to '1' (lowercase) */

    return s;
}

/** fopen with error checking.
 *  returns the open stream; exits with EXIT_FAILURE if fn or mode is
 *  NULL or if the file cannot be opened.
 */
FILE *xfopen (const char *fn, const char *mode)
{
    if (!fn || !mode) exit (EXIT_FAILURE);

    FILE *fp = fopen (fn, mode);
    if (!fp) {
        fprintf (stderr, "xfopen() error: file open failed '%s'.\n", fn);
        exit (EXIT_FAILURE);
    }

    return fp;
}
Compile:
gcc -Wall -Wextra -finline-functions -Ofast -o srch_words srch_words.c
With relatively simple changes to the comparison, e.g. using strncmp with the length of the search term to limit the comparison, you can match the term at the start of the current token, e.g.:
if (!strncmp (word[i].term, str2lower(p), word[i].tlen)) word[i].occ[n]++, word[i].freq++; /* add occ */
Example Input:
$ nl -b a dat/damages.txt 1 Personal injury damage awards are unliquidated 2 and are not capable of certain measurement; thus, the 3 jury has broad discretion in assessing the amount of 4 damages in a personal injury case. Yet, at the same 5 time, a factual sufficiency review insures that the 6 evidence supports the jury's award; and, although 7 difficult, the law requires appellate courts to conduct 8 factual sufficiency reviews on damage awards in 9 personal injury cases. Thus, while a jury has latitude in 10 assessing intangible damages in personal injury cases, 11 a jury's damage award does not escape the scrutiny of 12 appellate review. 13 14 Because Texas law applies no physical manifestation 15 rule to restrict wrongful death recoveries, a 16 trial court in a death case is prudent when it chooses 17 to submit the issues of mental anguish and loss of 18 society and companionship. While there is a 19 presumption of mental anguish for the wrongful death 20 beneficiary, the Texas Supreme Court has not indicated 21 that reviewing courts should presume that the mental 22 anguish is sufficient to support a large award. Testimony 23 that proves the beneficiary suffered severe mental 24 anguish or severe grief should be a significant and 25 sometimes determining factor in a factual sufficiency 26 analysis of large non-pecuniary damage awards.
Use/Output:
Searching for awards and texas through the passage above yields the following:
$ ./bin/srch_words awards texas <dat/damages.txt word 'awards' appears 3 times 1 times in line 1 1 times in line 8 1 times in line 26 word 'texas' appears 2 times 1 times in line 14 1 times in line 20
Changing the search terms to of and the to capture a couple of the more frequent words provides the following analysis:
$ ./bin/srch_words of the <dat/damages.txt word 'of' appears 7 times 1 times in line 2 1 times in line 3 1 times in line 11 2 times in line 17 1 times in line 19 1 times in line 26 word 'the' appears 12 times 1 times in line 2 1 times in line 3 1 times in line 4 1 times in line 5 1 times in line 6 1 times in line 7 1 times in line 11 1 times in line 17 1 times in line 19 1 times in line 20 1 times in line 21 1 times in line 23
Matching Partial Words
The changes required for partial matches using strtok
to tokenize each line are shown above. Below is the code for finding partial matches using strstr
and pointer arithmetic rather than tokenizing each line. Recall strstr
searches for the first occurrence of a substring (term) within a string, returning a pointer to the first character of the substring if found, NULL
otherwise. By repeatedly calling strstr
while the return is not NULL
, processing the match, and updating the line-pointer to point to the next character after the current substring before calling strstr
again, you can walk-the-pointer down the line finding each occurrence of a given term.
Summary of Code Operation
- validate and convert each argument to lower-case (search terms).
- read each line of the input file with
fgets
. - check complete line is read by checking for
'\n'
, set nl
flag. - convert line to lower-case
- for each term, search within each line for the term using
strstr
. - if match, update the start-pointer to point past the current term in the line.
- increment
word.occ
array, recording the line and the number of times the match occurs, and increment word.freq
to update the total frequency for the word in the file. - if full-line read (
nl
set), increment line count, repeat. - output the results for each search term.
Sample Code - Match Partial Words
/*
 * srch_words_part.c - count case-insensitive (ASCII) substring occurrences
 * of NTRM search terms in a text file (or stdin), per line and in total.
 * Matches are non-overlapping and may fall anywhere within a word.
 *
 * usage: srch_words_part term term [file (stdin)]
 *
 * Limits: only the first MAXL lines are examined.  Lines are read in chunks
 * of at most MAXC-1 characters and searched per chunk, so a term that
 * straddles a chunk boundary of an over-long line is not found.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* constants for num & max terms, lines and chars */
enum { NTRM = 2, MAXT = 29, MAXL = 128, MAXC = 512 };

typedef struct {            /* search term struct */
    char term[MAXT];        /* search term, lower-case, NUL-terminated */
    unsigned occ[MAXL];     /* occurrences per line (index = line - 1) */
    size_t tlen;            /* search term length (always > 0) */
    size_t freq;            /* total occurrences */
} sterm;

char *str2lower (char *s);
FILE *xfopen (const char *fn, const char *mode);

int main (int argc, char **argv)
{
    if (argc < NTRM + 1) {  /* validate NTRM search terms given */
        fprintf (stderr, "error: insufficient input.\n"
                         "usage: %s term term [file (stdin)]\n", argv[0]);
        return 1;
    }

    sterm word[NTRM] = {{ .term = "" }};    /* all members zeroed */
    size_t i, j, n = 0;                     /* n = completed line count */
    int partial = 0;        /* last chunk read had no trailing '\n' */
    char buf[MAXC] = "";

    for (i = 0; i < NTRM; i++) {            /* fill word w/search terms */
        size_t len = strlen (argv[i + 1]);

        /* strstr finds "" at every position without advancing, which
         * would spin the search loop below forever */
        if (len == 0) {
            fprintf (stderr, "error: empty search term.\n");
            return 1;
        }
        /* refuse to truncate: a clipped term would match the wrong text
         * (and an unterminated one would be read out of bounds) */
        if (len >= MAXT) {
            fprintf (stderr, "error: search term '%s' exceeds %d characters.\n",
                     argv[i + 1], MAXT - 1);
            return 1;
        }
        memcpy (word[i].term, argv[i + 1], len + 1);    /* copy incl. NUL */
        word[i].tlen = len;                             /* save length */
        str2lower (word[i].term);                       /* convert to lower */
    }

    /* optional file name follows the NTRM terms */
    FILE *fp = argc > NTRM + 1 ? xfopen (argv[NTRM + 1], "r") : stdin;

    while (n < MAXL && fgets (buf, MAXC, fp)) {         /* read each line */
        int nl = strchr (buf, '\n') != NULL;            /* complete line? */

        str2lower (buf);                                /* convert to lower */

        for (i = 0; i < NTRM; i++) {        /* for each search term */
            const char *sp = buf;           /* where the next search starts */
            const char *ep;                 /* start of the current match */

            /* walk the pointer down the line, resuming just past each hit */
            while ((ep = strstr (sp, word[i].term)) != NULL) {
                word[i].occ[n]++;           /* add occurrence for line */
                word[i].freq++;
                sp = ep + word[i].tlen;
            }
        }

        if (nl) {
            n++;                            /* increment line count */
            partial = 0;
        }
        else {
            partial = 1;
            /* a last line lacking '\n' at end-of-file is not a short read */
            if (!feof (fp))
                fprintf (stderr, "warning: short-read line: %zu\n", n + 1);
        }
    }
    if (partial)    /* count the final unterminated line so its matches */
        n++;        /* show up in the per-line report (n stays <= MAXL) */

    if (fp != stdin) fclose (fp);           /* close file if not stdin */

    for (i = 0; i < NTRM; i++) {            /* print results */
        printf ("\n word '%s' appears %zu times\n",
                word[i].term, word[i].freq);
        for (j = 0; j < n; j++)
            if (word[i].occ[j])
                printf (" %2u times in line %zu\n", word[i].occ[j], j + 1);
    }
    putchar ('\n');

    return 0;
}

/** convert string to lowercase (ASCII letters only), in place.
 *  returns s, or NULL if s is NULL.
 *  0x20 (1 << 5, the sixth bit) is the case bit in 7-bit ASCII
 *  (1 = lowercase).
 */
char *str2lower (char *s)
{
    if (!s) return NULL;

    for (char *p = s; *p; p++)
        if ('A' <= *p && *p <= 'Z')     /* if uppercase */
            *p |= (1 << 5);             /* set case bit to '1' (lowercase) */

    return s;
}

/** fopen with error checking.
 *  returns the open stream; exits with EXIT_FAILURE if fn or mode is
 *  NULL or if the file cannot be opened.
 */
FILE *xfopen (const char *fn, const char *mode)
{
    if (!fn || !mode) exit (EXIT_FAILURE);

    FILE *fp = fopen (fn, mode);
    if (!fp) {
        fprintf (stderr, "xfopen() error: file open failed '%s'.\n", fn);
        exit (EXIT_FAILURE);
    }

    return fp;
}
Use/Output:
$ ./bin/srch_words_part of the <dat/damages.txt word 'of' appears 7 times 1 times in line 2 1 times in line 3 1 times in line 11 2 times in line 17 1 times in line 19 1 times in line 26 word 'the' appears 13 times 1 times in line 2 1 times in line 3 1 times in line 4 1 times in line 5 1 times in line 6 1 times in line 7 1 times in line 11 1 times in line 17 1 times in line 18 1 times in line 19 1 times in line 20 1 times in line 21 1 times in line 23