xref: /illumos-gate/usr/src/cmd/vgrind/regexp.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
7 #pragma ident	"%Z%%M%	%I%	%E% SMI"
8 	  /* from UCB 5.1 (Berkeley) 6/5/85 */
9 
10 #include <ctype.h>
11 
12 typedef int	boolean;
13 #define TRUE	1
14 #define FALSE	0
15 #define NIL	0
16 
17 extern boolean	l_onecase;	/* true if upper and lower equivalent */
18 extern char	*l_idchars;	/* set of characters legal in identifiers
19 				   in addition to letters and digits */
20 
21 extern char	*strchr();
22 
23 #define isidchr(c)	\
24 		(isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL))
25 #define makelower(c)	(isupper((c)) ? tolower((c)) : (c))
26 
27 /*  STRNCMP -	like strncmp except that we convert the
28  *	 	first string to lower case before comparing
29  *		if l_onecase is set.
30  */
31 
32 STRNCMP(s1, s2, len)
33 	register char *s1,*s2;
34 	register int len;
35 {
36 	if (l_onecase) {
37 	    do
38 		if (*s2 - makelower(*s1))
39 			return (*s2 - makelower(*s1));
40 		else {
41 			s2++;
42 			s1++;
43 		}
44 	    while (--len);
45 	} else {
46 	    do
47 		if (*s2 - *s1)
48 			return (*s2 - *s1);
49 		else {
50 			s2++;
51 			s1++;
52 		}
53 	    while (--len);
54 	}
55 	return(0);
56 }
57 
58 /*	The following routine converts an irregular expression to
59  *	internal format.
60  *
61  *	Either meta symbols (\a \d or \p) or character strings or
62  *	operations ( alternation or parenthesizing ) can be
63  *	specified.  Each starts with a descriptor byte.  The descriptor
64  *	byte has STR set for strings, META set for meta symbols
65  *	and OPER set for operations.
66  *	The descriptor byte can also have the OPT bit set if the object
67  *	defined is optional.  Also ALT can be set to indicate an alternation.
68  *
69  *	For metasymbols the byte following the descriptor byte identities
70  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
71  *	strings the byte after the descriptor is a character count for
72  *	the string:
73  *
74  *		meta symbols := descriptor
75  *				symbol
76  *
77  *		strings :=	descriptor
78  *				character count
79  *				the string
80  *
81  *		operations :=	descriptor
82  *				symbol
83  *				character count
84  */
85 
86 /*
87  *  handy macros for accessing parts of match blocks
88  */
89 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
90 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
91 
92 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
93 #define OCNT(A) (*(A+2))	/* character count */
94 #define ONEXT(A) (A+3)		/* next character after the operation */
95 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
96 
97 #define SCNT(A) (*(A+1))	/* byte count of a string */
98 #define SSTR(A) (A+2)		/* address of the string */
99 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
100 
101 /*
102  *  bit flags in the descriptor
103  */
104 #define OPT 1
105 #define STR 2
106 #define META 4
107 #define ALT 8
108 #define OPER 16
109 
110 char *ure;		/* pointer current position in unconverted exp */
111 char *ccre;		/* pointer to current position in converted exp*/
112 char *malloc();
113 
114 char *
115 convexp(re)
116     char *re;		/* unconverted irregular expression */
117 {
118     register char *cre;		/* pointer to converted regular expression */
119 
120     /* allocate room for the converted expression */
121     if (re == NIL)
122 	return (NIL);
123     if (*re == '\0')
124 	return (NIL);
125     cre = malloc (4 * strlen(re) + 3);
126     ccre = cre;
127     ure = re;
128 
129     /* start the conversion with a \a */
130     *cre = META | OPT;
131     MSYM(cre) = 'a';
132     ccre = MNEXT(cre);
133 
134     /* start the conversion (its recursive) */
135     expconv ();
136     *ccre = 0;
137     return (cre);
138 }
139 
140 expconv()
141 {
142     register char *cs;		/* pointer to current symbol in converted exp */
143     register char c;		/* character being processed */
144     register char *acs;		/* pinter to last alternate */
145     register int temp;
146 
147     /* let the conversion begin */
148     acs = NIL;
149     cs = NIL;
150     while (*ure != NIL) {
151 	switch (c = *ure++) {
152 
153 	case '\\':
154 	    switch (c = *ure++) {
155 
156 	    /* escaped characters are just characters */
157 	    default:
158 		if (cs == NIL || (*cs & STR) == 0) {
159 		    cs = ccre;
160 		    *cs = STR;
161 		    SCNT(cs) = 1;
162 		    ccre += 2;
163 		} else
164 		    SCNT(cs)++;
165 		*ccre++ = c;
166 		break;
167 
168 	    /* normal(?) metacharacters */
169 	    case 'a':
170 	    case 'd':
171 	    case 'e':
172 	    case 'p':
173 		if (acs != NIL && acs != cs) {
174 		    do {
175 			temp = OCNT(acs);
176 			OCNT(acs) = ccre - acs;
177 			acs -= temp;
178 		    } while (temp != 0);
179 		    acs = NIL;
180 		}
181 		cs = ccre;
182 		*cs = META;
183 		MSYM(cs) = c;
184 		ccre = MNEXT(cs);
185 		break;
186 	    }
187 	    break;
188 
189 	/* just put the symbol in */
190 	case '^':
191 	case '$':
192 	    if (acs != NIL && acs != cs) {
193 		do {
194 		    temp = OCNT(acs);
195 		    OCNT(acs) = ccre - acs;
196 		    acs -= temp;
197 		} while (temp != 0);
198 		acs = NIL;
199 	    }
200 	    cs = ccre;
201 	    *cs = META;
202 	    MSYM(cs) = c;
203 	    ccre = MNEXT(cs);
204 	    break;
205 
206 	/* mark the last match sequence as optional */
207 	case '?':
208 	    if (cs)
209 	    	*cs = *cs | OPT;
210 	    break;
211 
212 	/* recurse and define a subexpression */
213 	case '(':
214 	    if (acs != NIL && acs != cs) {
215 		do {
216 		    temp = OCNT(acs);
217 		    OCNT(acs) = ccre - acs;
218 		    acs -= temp;
219 		} while (temp != 0);
220 		acs = NIL;
221 	    }
222 	    cs = ccre;
223 	    *cs = OPER;
224 	    OSYM(cs) = '(';
225 	    ccre = ONEXT(cs);
226 	    expconv ();
227 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
228 	    break;
229 
230 	/* return from a recursion */
231 	case ')':
232 	    if (acs != NIL) {
233 		do {
234 		    temp = OCNT(acs);
235 		    OCNT(acs) = ccre - acs;
236 		    acs -= temp;
237 		} while (temp != 0);
238 		acs = NIL;
239 	    }
240 	    cs = ccre;
241 	    *cs = META;
242 	    MSYM(cs) = c;
243 	    ccre = MNEXT(cs);
244 	    return;
245 
246 	/* mark the last match sequence as having an alternate */
247 	/* the third byte will contain an offset to jump over the */
248 	/* alternate match in case the first did not fail */
249 	case '|':
250 	    if (acs != NIL && acs != cs)
251 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
252 	    else
253 		OCNT(ccre) = 0;
254 	    *cs |= ALT;
255 	    cs = ccre;
256 	    *cs = OPER;
257 	    OSYM(cs) = '|';
258 	    ccre = ONEXT(cs);
259 	    acs = cs;	/* remember that the pointer is to be filles */
260 	    break;
261 
262 	/* if its not a metasymbol just build a scharacter string */
263 	default:
264 	    if (cs == NIL || (*cs & STR) == 0) {
265 		cs = ccre;
266 		*cs = STR;
267 		SCNT(cs) = 1;
268 		ccre = SSTR(cs);
269 	    } else
270 		SCNT(cs)++;
271 	    *ccre++ = c;
272 	    break;
273 	}
274     }
275     if (acs != NIL) {
276 	do {
277 	    temp = OCNT(acs);
278 	    OCNT(acs) = ccre - acs;
279 	    acs -= temp;
280 	} while (temp != 0);
281 	acs = NIL;
282     }
283     return;
284 }
285 /* end of convertre */
286 
287 
288 /*
289  *	The following routine recognises an irregular expresion
290  *	with the following special characters:
291  *
292  *		\?	-	means last match was optional
293  *		\a	-	matches any number of characters
294  *		\d	-	matches any number of spaces and tabs
295  *		\p	-	matches any number of alphanumeric
296  *				characters. The
297  *				characters matched will be copied into
298  *				the area pointed to by 'name'.
299  *		\|	-	alternation
300  *		\( \)	-	grouping used mostly for alternation and
301  *				optionality
302  *
303  *	The irregular expression must be translated to internal form
304  *	prior to calling this routine
305  *
306  *	The value returned is the pointer to the first non \a
307  *	character matched.
308  */
309 
310 boolean _escaped;		/* true if we are currently _escaped */
311 char *Start;			/* start of string */
312 
313 char *
314 expmatch (s, re, mstring)
315     register char *s;		/* string to check for a match in */
316     register char *re;		/* a converted irregular expression */
317     register char *mstring;	/* where to put whatever matches a \p */
318 {
319     register char *cs;		/* the current symbol */
320     register char *ptr,*s1;	/* temporary pointer */
321     boolean matched;		/* a temporary boolean */
322 
323     /* initial conditions */
324     if (re == NIL)
325 	return (NIL);
326     cs = re;
327     matched = FALSE;
328 
329     /* loop till expression string is exhausted (or at least pretty tired) */
330     while (*cs) {
331 	switch (*cs & (OPER | STR | META)) {
332 
333 	/* try to match a string */
334 	case STR:
335 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
336 	    if (matched) {
337 
338 		/* hoorah it matches */
339 		s += SCNT(cs);
340 		cs = SNEXT(cs);
341 	    } else if (*cs & ALT) {
342 
343 		/* alternation, skip to next expression */
344 		cs = SNEXT(cs);
345 	    } else if (*cs & OPT) {
346 
347 		/* the match is optional */
348 		cs = SNEXT(cs);
349 		matched = 1;		/* indicate a successful match */
350 	    } else {
351 
352 		/* no match, error return */
353 		return (NIL);
354 	    }
355 	    break;
356 
357 	/* an operator, do something fancy */
358 	case OPER:
359 	    switch (OSYM(cs)) {
360 
361 	    /* this is an alternation */
362 	    case '|':
363 		if (matched)
364 
365 		    /* last thing in the alternation was a match, skip ahead */
366 		    cs = OPTR(cs);
367 		else
368 
369 		    /* no match, keep trying */
370 		    cs = ONEXT(cs);
371 		break;
372 
373 	    /* this is a grouping, recurse */
374 	    case '(':
375 		ptr = expmatch (s, ONEXT(cs), mstring);
376 		if (ptr != NIL) {
377 
378 		    /* the subexpression matched */
379 		    matched = 1;
380 		    s = ptr;
381 		} else if (*cs & ALT) {
382 
383 		    /* alternation, skip to next expression */
384 		    matched = 0;
385 		} else if (*cs & OPT) {
386 
387 		    /* the match is optional */
388 		    matched = 1;	/* indicate a successful match */
389 		} else {
390 
391 		    /* no match, error return */
392 		    return (NIL);
393 		}
394 		cs = OPTR(cs);
395 		break;
396 	    }
397 	    break;
398 
399 	/* try to match a metasymbol */
400 	case META:
401 	    switch (MSYM(cs)) {
402 
403 	    /* try to match anything and remember what was matched */
404 	    case 'p':
405 		/*
406 		 *  This is really the same as trying the match the
407 		 *  remaining parts of the expression to any subset
408 		 *  of the string.
409 		 */
410 		s1 = s;
411 		do {
412 		    ptr = expmatch (s1, MNEXT(cs), mstring);
413 		    if (ptr != NIL && s1 != s) {
414 
415 			/* we have a match, remember the match */
416 			strncpy (mstring, s, s1 - s);
417 			mstring[s1 - s] = '\0';
418 			return (ptr);
419 		    } else if (ptr != NIL && (*cs & OPT)) {
420 
421 			/* it was aoptional so no match is ok */
422 			return (ptr);
423 		    } else if (ptr != NIL) {
424 
425 			/* not optional and we still matched */
426 			return (NIL);
427 		    }
428 		    if (!isidchr(*s1))
429 			return (NIL);
430 		    if (*s1 == '\\')
431 			_escaped = _escaped ? FALSE : TRUE;
432 		    else
433 			_escaped = FALSE;
434 		} while (*s1++);
435 		return (NIL);
436 
437 	    /* try to match anything */
438 	    case 'a':
439 		/*
440 		 *  This is really the same as trying the match the
441 		 *  remaining parts of the expression to any subset
442 		 *  of the string.
443 		 */
444 		s1 = s;
445 		do {
446 		    ptr = expmatch (s1, MNEXT(cs), mstring);
447 		    if (ptr != NIL && s1 != s) {
448 
449 			/* we have a match */
450 			return (ptr);
451 		    } else if (ptr != NIL && (*cs & OPT)) {
452 
453 			/* it was aoptional so no match is ok */
454 			return (ptr);
455 		    } else if (ptr != NIL) {
456 
457 			/* not optional and we still matched */
458 			return (NIL);
459 		    }
460 		    if (*s1 == '\\')
461 			_escaped = _escaped ? FALSE : TRUE;
462 		    else
463 			_escaped = FALSE;
464 		} while (*s1++);
465 		return (NIL);
466 
467 	    /* fail if we are currently _escaped */
468 	    case 'e':
469 		if (_escaped)
470 		    return(NIL);
471 		cs = MNEXT(cs);
472 		break;
473 
474 	    /* match any number of tabs and spaces */
475 	    case 'd':
476 		ptr = s;
477 		while (*s == ' ' || *s == '\t')
478 		    s++;
479 		if (s != ptr || s == Start) {
480 
481 		    /* match, be happy */
482 		    matched = 1;
483 		    cs = MNEXT(cs);
484 		} else if (*s == '\n' || *s == '\0') {
485 
486 		    /* match, be happy */
487 		    matched = 1;
488 		    cs = MNEXT(cs);
489 		} else if (*cs & ALT) {
490 
491 		    /* try the next part */
492 		    matched = 0;
493 		    cs = MNEXT(cs);
494 		} else if (*cs & OPT) {
495 
496 		    /* doesn't matter */
497 		    matched = 1;
498 		    cs = MNEXT(cs);
499 		} else
500 
501 		    /* no match, error return */
502 		    return (NIL);
503 		break;
504 
505 	    /* check for end of line */
506 	    case '$':
507 		if (*s == '\0' || *s == '\n') {
508 
509 		    /* match, be happy */
510 		    s++;
511 		    matched = 1;
512 		    cs = MNEXT(cs);
513 		} else if (*cs & ALT) {
514 
515 		    /* try the next part */
516 		    matched = 0;
517 		    cs = MNEXT(cs);
518 		} else if (*cs & OPT) {
519 
520 		    /* doesn't matter */
521 		    matched = 1;
522 		    cs = MNEXT(cs);
523 		} else
524 
525 		    /* no match, error return */
526 		    return (NIL);
527 		break;
528 
529 	    /* check for start of line */
530 	    case '^':
531 		if (s == Start) {
532 
533 		    /* match, be happy */
534 		    matched = 1;
535 		    cs = MNEXT(cs);
536 		} else if (*cs & ALT) {
537 
538 		    /* try the next part */
539 		    matched = 0;
540 		    cs = MNEXT(cs);
541 		} else if (*cs & OPT) {
542 
543 		    /* doesn't matter */
544 		    matched = 1;
545 		    cs = MNEXT(cs);
546 		} else
547 
548 		    /* no match, error return */
549 		    return (NIL);
550 		break;
551 
552 	    /* end of a subexpression, return success */
553 	    case ')':
554 		return (s);
555 	    }
556 	    break;
557 	}
558     }
559     return (s);
560 }
561