/* * Copyright (c) 2002 by The XFree86 Project, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Except as contained in this notice, the name of the XFree86 Project shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in this Software without prior written authorization from the * XFree86 Project. * * Author: Paulo César Pereira de Andrade */ /* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */ #include "re.h" #ifndef _rep_h #define _rep_h /* * Local defines */ #ifdef MIN #undef MIN #endif #define MIN(a, b) ((a) < (b) ? (a) : (b)) #ifdef MAX #undef MAX #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) /* This value can not be larger than 255, a depth value is the nesting of * repetition operations and alternatives. The number of nested parenthesis * does not matter, but a repetition on the pattern inside the parenthesis * does. Note also that you cannot have more than 9 parenthesis pairs in * an expression. * Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed * 7 complex repetitions. A complex repetition is a dot followed by an * repetition operator. It is called a complex repetition because dot * matches anything but the empty string, so the engine needs to test * all possible combinations until the end of the string is found. * Repetitions like .* use one depth until the end of the string is found, * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2. */ #define MAX_DEPTH 8 /* Minimum number of strings to generate a "large" string list, that is, * sort the strings and allocate 512 extra bytes to map the first string * with a given initial byte. */ #define LARGE_STL_COUNT 16 /* * Local types */ /* Intermediate compilation types declaration */ /* (r)egular (e)xpression (c)ompile (c)a(se) */ typedef struct _rec_cse rec_cse; /* (r)egular (e)xpression (c)ompile (r)a(ng)e */ typedef struct _rec_rng rec_rng; /* (r)egular (e)xpression (c)ompile (pat)tern */ typedef struct _rec_pat rec_pat; /* (r)egular (e)xpression (c)ompile (rep)etition */ typedef struct _rec_rep rec_rep; /* (r)egular (e)xpression (c)ompile (gr)ou(p) */ typedef struct _rec_grp rec_grp; /* (r)egular (e)xpression (c)ompile (alt)ernatives */ typedef struct _rec_alt rec_alt; /* Optimization types */ /* (r)egular (e)xpression (c)ompile (st)ring (l)ist */ typedef struct _rec_stl rec_stl; /* Final compilation and execution types */ /* (re)gular expression (inf)ormation */ typedef struct _re_inf re_inf; /* (re)gular expression (eng)ine */ typedef struct _re_eng re_eng; /* Codes used by the engine */ typedef enum { /* Grouping */ Re_Open, /* ( */ Re_Close, /* ) */ Re_Update, /* Like Re_Close, but is inside a loop */ /* Alternatives */ Re_Alt, /* Start alternative list, + next offset */ Re_AltNext, /* Next alternative, + next offset */ Re_AltDone, /* Finish alternative list */ /* Repetition */ Re_AnyTimes, /* * */ Re_Maybe, /* ? */ Re_AtLeast, /* +, at least one */ /* Repetition like */ Re_AnyAnyTimes, /* .* */ Re_AnyMaybe, /* .? */ Re_AnyAtLeast, /* .+ */ Re_AnyEatAnyTimes, /* Expression ends with .* */ Re_AnyEatMaybe, /* Expression ends with .? */ Re_AnyEatAtLeast, /* Expression ends with .+ */ /* Repetition with arguments */ Re_Exact, /* {e} */ Re_Min, /* {n,} */ Re_Max, /* {,m} */ Re_MinMax, /* {n,m} */ /* Repetition helper instruction */ Re_RepJump, /* Special code, go back to repetition */ Re_RepLongJump, /* Jump needs two bytes */ /* After the repetition data, all repetitions have an offset * to the code after the repetition */ /* Matching */ Re_Any, /* . */ Re_Odigit, /* \o */ Re_OdigitNot, /* \O */ Re_Digit, /* \d */ Re_DigitNot, /* \D */ Re_Xdigit, /* \x */ Re_XdigitNot, /* \x */ Re_Space, /* \s */ Re_SpaceNot, /* \S */ Re_Tab, /* \t */ Re_Newline, /* \n */ Re_Lower, /* \l */ Re_Upper, /* \u */ Re_Alnum, /* \w */ Re_AlnumNot, /* \W */ Re_Control, /* \c */ Re_ControlNot, /* \C */ Re_Bol, /* ^ */ Re_Eol, /* $ */ Re_Bow, /* \< */ Re_Eow, /* \> */ /* Range matching information */ Re_Range, /* + 256 bytes */ Re_RangeNot, /* + 256 bytes */ /* Matching with arguments */ Re_Literal, /* + character */ Re_CaseLiteral, /* + lower + upper */ Re_LiteralNot, /* + character */ Re_CaseLiteralNot, /* + lower + upper */ Re_String, /* + length + string */ Re_CaseString, /* + length + string in format lower-upper */ /* These are useful to start matching, or when RE_NOSPEC is used. */ Re_SearchLiteral, Re_SearchCaseLiteral, Re_SearchString, Re_SearchCaseString, Re_StringList, /* + total-length + lengths + strings */ Re_CaseStringList, /* + total-length + lengths + strings */ Re_LargeStringList, /* + total-length + lengths + map + strings */ Re_LargeCaseStringList, /* + total-length + lengths + map + strings */ /* Backreference */ Re_Backref, /* + reference number */ /* The last codes */ Re_DoneIf, /* Done if at end of input */ Re_MaybeDone, /* Done */ Re_Done /* If this code found, finished execution */ } ReCode; /* (r)egular (e)xpresssion (pat)rern (t)ype */ typedef enum _rec_pat_t { Rep_Literal = Re_Literal, Rep_CaseLiteral = Re_CaseLiteral, Rep_LiteralNot = Re_LiteralNot, Rep_CaseLiteralNot = Re_CaseLiteralNot, Rep_Range = Re_Range, Rep_RangeNot = Re_RangeNot, Rep_String = Re_String, Rep_CaseString = Re_CaseString, Rep_SearchLiteral = Re_SearchLiteral, Rep_SearchCaseLiteral = Re_SearchCaseLiteral, Rep_SearchString = Re_SearchString, Rep_SearchCaseString = Re_SearchCaseString, Rep_Any = Re_Any, Rep_AnyAnyTimes = Re_AnyAnyTimes, Rep_AnyEatAnyTimes = Re_AnyEatAnyTimes, Rep_AnyMaybe = Re_AnyMaybe, Rep_AnyEatMaybe = Re_AnyEatMaybe, Rep_AnyAtLeast = Re_AnyAtLeast, Rep_AnyEatAtLeast = Re_AnyEatAtLeast, Rep_Odigit = Re_Odigit, Rep_OdigitNot = Re_OdigitNot, Rep_Digit = Re_Digit, Rep_DigitNot = Re_DigitNot, Rep_Xdigit = Re_Xdigit, Rep_XdigitNot = Re_XdigitNot, Rep_Space = Re_Space, Rep_SpaceNot = Re_SpaceNot, Rep_Tab = Re_Tab, Rep_Newline = Re_Newline, Rep_Lower = Re_Lower, Rep_Upper = Re_Upper, Rep_Alnum = Re_Alnum, Rep_AlnumNot = Re_AlnumNot, Rep_Control = Re_Control, Rep_ControlNot = Re_ControlNot, Rep_Bol = Re_Bol, Rep_Eol = Re_Eol, Rep_Bow = Re_Bow, Rep_Eow = Re_Eow, Rep_Backref = Re_Backref, Rep_StringList = Re_StringList, Rep_Group = Re_Open } rec_pat_t; /* (r)egular (e)xpression (rep)etition (t)ype */ typedef enum _rec_rep_t { Rer_AnyTimes = Re_AnyTimes, Rer_AtLeast = Re_AtLeast, Rer_Maybe = Re_Maybe, Rer_Exact = Re_Exact, Rer_Min = Re_Min, Rer_Max = Re_Max, Rer_MinMax = Re_MinMax } rec_rep_t; /* Decide at re compilation time what is lowercase and what is uppercase */ struct _rec_cse { unsigned char lower; unsigned char upper; }; /* A rec_rng is used only during compilation, just a character map */ struct _rec_rng { unsigned char range[256]; }; /* A rec_pat is used only during compilation, and can be viewed as * a regular expression element like a match to any character, a match * to the beginning or end of the line, etc. * It is implemented as a linked list, and does not have nesting. * The data field can contain: * chr: the value of a single character to match. * cse: the upper and lower case value of a character to match. * rng: a character map to match or not match. * str: a simple string or a string where every two bytes * represents the character to match, in lower/upper * case sequence. * The rep field is not used for strings, strings are broken in the * last character in this case. That is, strings are just a concatenation * of several character matches. */ struct _rec_pat { rec_pat_t type; rec_pat *next, *prev; /* Linked list information */ union { unsigned char chr; rec_cse cse; rec_rng *rng; rec_grp *grp; unsigned char *str; rec_stl *stl; } data; rec_rep *rep; /* Pattern repetition information */ }; /* A rec_rep is used only during compilation, and can be viewed as: * * ? or * or + or {} or {,} or {,} or {,} * * where is "exact", is "minimum" and is "maximum". * In the compiled step it can also be just a NULL pointer, that * is actually equivalent to {1}. */ struct _rec_rep { rec_rep_t type; short mine; /* minimum or exact number of matches */ short maxc; /* maximum number of matches */ }; /* A rec_alt is used only during compilation, and can be viewed as: * * | * * where is any regular expression. The expressions are nested * using the grp field of the rec_pat structure. */ struct _rec_alt { rec_alt *next, *prev; /* Linked list information */ rec_pat *pat; }; /* A rec_grp is a place holder for expressions enclosed in parenthesis * and is linked to the compilation data by an rec_pat structure. */ struct _rec_grp { rec_pat *parent; /* Reference to parent pattern */ rec_alt *alt; /* The pattern information */ rec_alt *palt; /* Parent alternative */ rec_grp *pgrp; /* Nested groups */ int comp; /* (comp)lex repetition pattern inside group */ }; /* Optimization compilation types definition */ /* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */ typedef enum { Resl_StringList = Re_StringList, Resl_CaseStringList = Re_CaseStringList } rec_stl_t; struct _rec_stl { rec_stl_t type; int nstrs; /* Number of strings in list */ int tlen; /* Total length of all strings */ unsigned char *lens; /* Vector of string lengths */ unsigned char **strs; /* The strings */ }; /* * Prototypes */ /* rep.c */ rec_alt *irec_comp(const char*, const char*, int, int*); void irec_free_alt(rec_alt*); /* reo.c */ int orec_comp(rec_alt*, int); void orec_free_stl(rec_stl*); #endif /* _rep_h */