1. Homepage of Dr. Zoltán Porkoláb
    1. Home
    2. Archive
  2. Teaching
    1. Timetable
    2. Multiparadigm programming (MSc)
    3. C programming (BSc for physicists)
    4. Project tools (BSc)
    5. Bolyai College
    6. C++ (for foreign studenst)
    7. Software technology lab
    8. BSc and MSc thesis
  3. Research
    1. Templight
    2. CodeChecker
    3. CodeCompass
    4. Projects
    5. Publications (up to 2011)
    6. PhD students
  4. Affiliations
    1. Dept. of Programming Languages and Compilers
    2. Ericsson Hungary Ltd

The C programming language – Lecture 13.

Implementation of a UNIX filter

We will implement a grep like UNIX utility. Such commands are frequenty read input files line by line and do an action on them. We can specify multiple files, and the command will repeat the action on each files. If we do not specify file, the command reads standard input until EOF. Such commands are called filters in UNIX.

course we will simplify the grep command, we will not implement regular expressions and many standard flags.

This is how we can use our command:

$ gr pattern file1 file2 file3

This will search the parren pattern in file1, file2, and file3 and pront all lines where the pattern occurs.

We can specify flags in the command, like -i or -v to modify the default behavour.

$ gr -iv pattern file1 file2 file3

Flag -v will print only those lines where there was no matching.

Flag -i means that we ignore case sensitivity on the matching.

Flag -w will select only those lines containing matches that form whole words, i.e. the pattern is separated from the environment.

The precise definition of grep can be found here.

We will go on this project step by step.

The arguments of the main function

The program should be called with at least 1 command line parameter: the pattern. Therefore we will check argc: the number of parameters is always at least 1: argv[0] is the name of the program always given. If further parameters are given, then argc is greater than 1.

If argc < 2 we will print the correct usage with the usage() function. Usage receives the actual program name (argv[0]) as parameter. We finish the program with the exit function declared in stdlib.h.

The gr function reads one file (or stdin) until end-of-file (EOF) from in line by line. (The maximum lenght of a line is limited by BUFSIZE). Each line is checked against the p pattern. The matching is true, if the strstr function (declared in string.h) is returning true. In this case we print the whole line to the output file (out).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFSIZE 1024

void usage( char *prname)
{
  fprintf( stderr, "Usage: %s pattern [files...]\n", prname); 
  exit(1);
}
void gr( char *p, FILE *in, FILE *out)
{
  char buffer[BUFSIZE];
  while ( NULL != fgets( buffer, BUFSIZE, in) )
  {
    int is_match = ( NULL != strstr( buffer, p) );

    if ( is_match )
    {
      fputs( buffer, out);
    }
  }
}
int main( int argc, char *argv[])
{
  int i;
  if ( argc < 2 )
  {
    usage(argv[0]);
  }  
  if ( 2 == argc )
  {
    gr( argv[1], stdin, stdout);
  }
  else
  {
    for ( i = 2; i < argc; ++i)
    {
      FILE *fp = fopen( argv[i], "r");
      if ( NULL != fp )
      {
        gr( argv[1], fp, stdout);
        fclose(fp);
      }
      else
      {
        fprintf( stderr, "Can't open %s for read\n", argv[i]);
      }
    }
  }
  return 0;
}

We test the program without patterns:

$ ./a.out 
Usage: ./a.out pattern [files...]

…with pattern reading the input from standard input:

$ ./a.out if < gr.c 
    if ( is_match )
  if ( argc < 2 )
  if ( 2 == argc )
      if ( NULL != fp )

…and with file parameters:

$ ./a.out if gr.c gr.c
    if ( is_match )
  if ( argc < 2 )
  if ( 2 == argc )
      if ( NULL != fp )
    if ( is_match )
  if ( argc < 2 )
  if ( 2 == argc )
      if ( NULL != fp )

Handling of the command line flags

The next step will be teh handling of the parameters. We will do this in a separate function do_params(). Thsis function will receive argc and argv as input and fill a struct param_s structure into the _params. We also handle and store the pattern parameter here. The function will return the number of next argument after the flags, where the filename parameters as the following arguments start (if they were given). We implicitly suppose that flags preceed the pattern and file parameters.

We do not change the matching now, we just test the flags in main.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define BUFSIZE 1024

struct param_s
{
  int  iflag;     /* case insensitive on */
  int  vflag;     /* negation on         */
  int  wflag;     /* word regex on       */
  char *pattern;  /* pattern to search   */
  char *upattern; /* upper case pattern  */
};

void usage( char *prname)
{
  fprintf( stderr, "Usage: %s pattern [files...]\n", prname); 
  exit(1);
}
int do_params( struct param_s *p, int argc, char *argv[])
{
  int i = 1;

  p->iflag = 0;
  p->vflag = 0;
  p->wflag = 0;

  /* letter flags first */
  while ( i < argc  &&  '-' == argv[i][0] )
  {
    int j = 1;
    while ( '\0' != argv[i][j] )
    {
      switch(argv[i][j])
      {
      case 'i': p->iflag = 1; break;
      case 'v': p->vflag = 1; break;
      case 'w': p->wflag = 1; break;
      default : fprintf( stderr, "Invalid flag: %c\n", argv[i][j]);
                usage(argv[0]); /* invalid flag: exit() */
      }
      ++j;
    }
    ++i;
  }
  /* end of flags, pattern should come here */
  if ( i >= argc )
  {
    fprintf( stderr, "No pattern was given\n");
    usage(argv[0]);      /* no pattern: exit() */
  }
  p->pattern = argv[i];   /* ok, pattern found */

  if ( p->iflag )  /* case insensitive */
  {
    int k;
    p->upattern = (char *) malloc(strlen(p->pattern)+1);
    for ( k = 0; k < strlen(p->pattern)+1; ++k)
    {
      p->upattern[k] = toupper(p->pattern[k]);
    }
  }
  return ++i;  /* continue from next parameter */
}
void gr( struct param_s *p, FILE *in, FILE *out)
{
  char buffer[BUFSIZE];
  while ( NULL != fgets( buffer, BUFSIZE, in) )
  {
    int is_match = ( NULL != strstr( buffer, p->pattern) );

    if ( is_match )
    {
      fputs( buffer, out);
    }
  }
}
int main( int argc, char *argv[])
{
  struct param_s params;
  int i = do_params( &params, argc, argv);

  fprintf( stderr, "iflag    = %d\n", params.iflag);
  fprintf( stderr, "vflag    = %d\n", params.vflag);
  fprintf( stderr, "wflag    = %d\n", params.wflag);
  fprintf( stderr, "pattern  = %s\n", params.pattern);
  fprintf( stderr, "upattern = %s\n", params.upattern);
  fprintf( stderr, "i == %d\n", i);

  if ( i == argc )
  {
    gr( &params, stdin, stdout);
  }
  else
  {
    for ( ; i < argc; ++i)
    {
      FILE *fp = fopen( argv[i], "r");
      if ( NULL != fp )
      {
        gr( &params, fp, stdout);
        fclose(fp);
      }
      else
      {
        fprintf( stderr, "Can't open %s for read\n", argv[i]);
      }
    }
  }
  return 0;
}

We test the program with valid and invalid flags:

$ ./a.out xyz < gr.c
iflag    = 0
vflag    = 0
wflag    = 0
pattern  = pattern
upattern = (null)
i == 2
$ ./a.out -i -v pattern
iflag    = 1
vflag    = 1
wflag    = 0
pattern  = pattern
upattern = PATTERN
i == 4
$ ./a.out -iw -v pattern
iflag    = 1
vflag    = 1
wflag    = 1
pattern  = pattern
upattern = PATTERN
i == 4
$ ./a.out -xv pattern
Invalid flag: x
Usage: ./a.out pattern [files...]

The complete program

Finally we improve the gr function for the required flags.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/*
 * gr.c -- a simple grep-like program
 * usage: gr [-ivw] pattern [files...]
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define BUFSIZE 1024

struct param_s
{
  int  iflag;     /* case insensitive on */
  int  vflag;     /* negation on         */
  int  wflag;     /* word regex on       */
  char *pattern;  /* pattern to search   */
  char *upattern; /* upper case pattern  */
};

void usage( char *prname)
{
  fprintf( stderr, "Usage: %s [-ivw] pattern [files...]\n", prname); 
  exit(1);
}

int do_params( struct param_s *p, int argc, char *argv[])
{
  int i = 1;

  p->iflag = 0;
  p->vflag = 0;
  p->wflag = 0;

  /* letter flags first */
  while ( i < argc  &&  '-' == argv[i][0] )
  {
    int j = 1;
    while ( '\0' != argv[i][j] )
    {
      switch(argv[i][j])
      {
      case 'i': p->iflag = 1; break;
      case 'v': p->vflag = 1; break;
      case 'w': p->wflag = 1; break;
      default : fprintf( stderr, "Invalid flag: %c\n", argv[i][j]);
                usage(argv[0]); /* invalid flag: exit() */
      }
      ++j;
    }
    ++i;
  }
  /* end of flags, pattern should come here */
  if ( i >= argc )
  {
    fprintf( stderr, "No pattern was given\n");
    usage(argv[0]);      /* no pattern: exit() */
  }
  p->pattern = argv[i];   /* ok, pattern found */

  if ( p->iflag )  /* case insensitive */
  {
    int k;
    p->upattern = (char *) malloc(strlen(p->pattern)+1);
    for ( k = 0; k < strlen(p->pattern)+1; ++k)
    {
      p->upattern[k] = toupper(p->pattern[k]);
    }
  }
  return ++i;  /* continue from next parameter */
}

int is_delim( char ch)
{
  return !( isalpha(ch) || isdigit(ch) || '_' == ch );
}

int wmatch( char *buffer, char *pattern, char *where)
{
  char *before = where - 1;
  char *after =  where + strlen(pattern);

  return (  where == buffer  &&  is_delim(*after) ) ||
         (  is_delim(*before)  &&  '\n' == *after ) ||
         (  is_delim(*before) && is_delim(*after) );
}
 
void gr( struct param_s *p, FILE *in, FILE *out)
{
  char buffer[BUFSIZE];
  while ( NULL != fgets( buffer, BUFSIZE, in) )
  {
    int is_match  = 0; /* true if matches */
    char *where   = 0; /* pointer to beginning of matching */

    if ( p->iflag )
    {
      char ubuffer[BUFSIZE];
      int k;
      for ( k = 0; k < strlen(buffer)+1; ++k)
      {
        ubuffer[k] = toupper(buffer[k]);
      }
      is_match = ( NULL != (where = strstr( ubuffer, p->upattern)) );
      if ( is_match && p->wflag )
        is_match = wmatch( ubuffer, p->upattern, where);
    }
    else
    {
      is_match = ( NULL != (where = strstr( buffer, p->pattern)) );
      if ( is_match && p->wflag )
        is_match = wmatch( buffer, p->pattern, where);
    }
  
    if ( p->vflag )
    {
      is_match = ! is_match;
    }

    if ( is_match )
    {
      fputs( buffer, out);
    }
  }
}


int main( int argc, char *argv[])
{
  struct param_s params;
  int i = do_params( &params, argc, argv);

#ifdef DEBUG
  fprintf( stderr, "iflag    = %d\n", params.iflag);
  fprintf( stderr, "vflag    = %d\n", params.vflag);
  fprintf( stderr, "wflag    = %d\n", params.wflag);
  fprintf( stderr, "pattern  = %s\n", params.pattern);
  fprintf( stderr, "upattern = %s\n", params.upattern);
  fprintf( stderr, "i == %d\n", i);
#endif /* DEBUG */

  if ( i == argc )
  {
    gr( &params, stdin, stdout);
  }
  else
  {
    for ( ; i < argc; ++i)
    {
      FILE *fp = fopen( argv[i], "r");
      if ( NULL != fp )
      {
        gr( &params, fp, stdout);
        fclose(fp);
      }
      else
      {
        fprintf( stderr, "Can't open %s for read\n", argv[i]);
      }
    }
  }
  return 0;
}