00001 #if !defined(RXH) || defined(RX_WANT_SE_DEFS) 00002 #define RXH 00003 00004 /* Copyright (C) 1992, 1993 Free Software Foundation, Inc. 00005 00006 This file is part of the librx library. 00007 00008 Librx is free software; you can redistribute it and/or modify it under 00009 the terms of the GNU Library General Public License as published by 00010 the Free Software Foundation; either version 2, or (at your option) 00011 any later version. 00012 00013 Librx is distributed in the hope that it will be useful, but WITHOUT 00014 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 00015 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 00016 for more details. 00017 00018 You should have received a copy of the GNU Library General Public 00019 License along with this software; see the file COPYING.LIB. If not, 00020 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 00021 02139, USA. */ 00022 /* t. lord Wed Sep 23 18:20:57 1992 */ 00023 00024 00025 00026 00027 00028 00034 00035 00036 00037 #ifndef RX_WANT_SE_DEFS 00038 00039 /* This page: Bitsets */ 00040 00041 #ifndef RX_subset 00042 typedef unsigned int RX_subset; 00043 #define RX_subset_bits (32) 00044 #define RX_subset_mask (RX_subset_bits - 1) 00045 #endif 00046 00047 typedef RX_subset * rx_Bitset; 00048 00049 #ifdef __STDC__ 00050 typedef void (*rx_bitset_iterator) (void *, int member_index); 00051 #else 00052 typedef void (*rx_bitset_iterator) (); 00053 #endif 00054 00055 #define rx_bitset_subset(N) ((N) / RX_subset_bits) 00056 #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) 00057 #define RX_bitset_access(B,N,OP) \ 00058 ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) 00059 #define RX_bitset_member(B,N) RX_bitset_access(B, N, &) 00060 #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) 00061 #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) 00062 #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) 00063 #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) 00064 #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) 00065 00066 00067 00068 /* This page: Splay trees. */ 00069 00070 #ifdef __STDC__ 00071 typedef int (*rx_sp_comparer) (void * a, void * b); 00072 #else 00073 typedef int (*rx_sp_comparer) (); 00074 #endif 00075 00076 struct rx_sp_node 00077 { 00078 void * key; 00079 void * data; 00080 struct rx_sp_node * kids[2]; 00081 }; 00082 00083 #ifdef __STDC__ 00084 typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); 00085 #else 00086 typedef void (*rx_sp_key_data_freer) (); 00087 #endif 00088 00089 00090 /* giant inflatable hash trees */ 00091 00092 struct rx_hash_item 00093 { 00094 struct rx_hash_item * next_same_hash; 00095 struct rx_hash * table; 00096 unsigned long hash; 00097 void * data; 00098 void * binding; 00099 }; 00100 00101 struct rx_hash 00102 { 00103 struct rx_hash * parent; 00104 int refs; 00105 struct rx_hash * children[13]; 00106 struct rx_hash_item * buckets [13]; 00107 int bucket_size [13]; 00108 }; 00109 00110 struct rx_hash_rules; 00111 00112 #ifdef __STDC__ 00113 /* should return like == */ 00114 typedef int (*rx_hash_eq)(void *, void *); 00115 typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); 00116 typedef void (*rx_free_hash)(struct rx_hash *, 00117 struct rx_hash_rules *); 00118 typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, 00119 void *); 00120 typedef void (*rx_free_hash_item)(struct rx_hash_item *, 00121 struct rx_hash_rules *); 00122 #else 00123 typedef int (*rx_hash_eq)(); 00124 typedef struct rx_hash * (*rx_alloc_hash)(); 00125 typedef void (*rx_free_hash)(); 00126 typedef struct rx_hash_item * (*rx_alloc_hash_item)(); 00127 typedef void (*rx_free_hash_item)(); 00128 #endif 00129 00130 struct rx_hash_rules 00131 { 00132 rx_hash_eq eq; 00133 rx_alloc_hash hash_alloc; 00134 rx_free_hash free_hash; 00135 rx_alloc_hash_item hash_item_alloc; 00136 rx_free_hash_item free_hash_item; 00137 }; 00138 00139 00140 /* Forward declarations */ 00141 00142 struct rx_cache; 00143 struct rx_superset; 00144 struct rx; 00145 struct rx_se_list; 00146 00147 00148 00149 /* 00150 * GLOSSARY 00151 * 00152 * regexp 00153 * regular expression 00154 * expression 00155 * pattern - a `regular' expression. The expression 00156 * need not be formally regular -- it can contain 00157 * constructs that don't correspond to purely regular 00158 * expressions. 00159 * 00160 * buffer 00161 * string - the string (or strings) being searched or matched. 00162 * 00163 * pattern buffer - a structure of type `struct re_pattern_buffer' 00164 * This in turn contains a `struct rx', which holds the 00165 * NFA compiled from a pattern, as well as some of the state 00166 * of a matcher using the pattern. 00167 * 00168 * NFA - nondeterministic finite automata. Some people 00169 * use this term to a member of the class of 00170 * regular automata (those corresponding to a regular 00171 * language). However, in this code, the meaning is 00172 * more general. The automata used by Rx are comperable 00173 * in power to what are usually called `push down automata'. 00174 * 00175 * Two NFA are built by rx for every pattern. One is built 00176 * by the compiler. The other is built from the first, on 00177 * the fly, by the matcher. The latter is called the `superstate 00178 * NFA' because its states correspond to sets of states from 00179 * the first NFA. (Joe Keane gets credit for the name 00180 * `superstate NFA'). 00181 * 00182 * NFA edges 00183 * epsilon edges 00184 * side-effect edges - The NFA compiled from a pattern can have three 00185 * kinds of edges. Epsilon edges can be taken freely anytime 00186 * their source state is reached. Character set edges can be 00187 * taken when their source state is reached and when the next 00188 * character in the buffer is a member of the set. Side effect 00189 * edges imply a transition that can only be taken after the 00190 * indicated side effect has been successfully accomplished. 00191 * Some examples of side effects are: 00192 * 00193 * Storing the current match position to record the 00194 * location of a parentesized subexpression. 00195 * 00196 * Advancing the matcher over N characters if they 00197 * match the N characters previously matched by a 00198 * parentesized subexpression. 00199 * 00200 * Both of those kinds of edges occur in the NFA generated 00201 * by the pattern: \(.\)\1 00202 * 00203 * Epsilon and side effect edges are similar. Unfortunately, 00204 * some of the code uses the name `epsilon edge' to mean 00205 * both epsilon and side effect edges. For example, the 00206 * function has_non_idempotent_epsilon_path computes the existance 00207 * of a non-trivial path containing only a mix of epsilon and 00208 * side effect edges. In that case `nonidempotent epsilon' is being 00209 * used to mean `side effect'. 00210 */ 00211 00212 00213 00214 00215 00216 /* LOW LEVEL PATTERN BUFFERS */ 00217 00218 /* Suppose that from some NFA state, more than one path through 00219 * side-effect edges is possible. In what order should the paths 00220 * be tried? A function of type rx_se_list_order answers that 00221 * question. It compares two lists of side effects, and says 00222 * which list comes first. 00223 */ 00224 00225 #ifdef __STDC__ 00226 typedef int (*rx_se_list_order) (struct rx *, 00227 struct rx_se_list *, 00228 struct rx_se_list *); 00229 #else 00230 typedef int (*rx_se_list_order) (); 00231 #endif 00232 00233 00234 00235 /* Struct RX holds a compiled regular expression - that is, an nfa 00236 * ready to be converted on demand to a more efficient superstate nfa. 00237 * This is for the low level interface. The high-level interfaces enclose 00238 * this in a `struct re_pattern_buffer'. 00239 */ 00240 struct rx 00241 { 00242 /* The compiler assigns a unique id to every pattern. 00243 * Like sequence numbers in X, there is a subtle bug here 00244 * if you use Rx in a system that runs for a long time. 00245 * But, because of the way the caches work out, it is almost 00246 * impossible to trigger the Rx version of this bug. 00247 * 00248 * The id is used to validate superstates found in a cache 00249 * of superstates. It isn't sufficient to let a superstate 00250 * point back to the rx for which it was compiled -- the caller 00251 * may be re-using a `struct rx' in which case the superstate 00252 * is not really valid. So instead, superstates are validated 00253 * by checking the sequence number of the pattern for which 00254 * they were built. 00255 */ 00256 int rx_id; 00257 00258 /* This is memory mgt. state for superstates. This may be 00259 * shared by more than one struct rx. 00260 */ 00261 struct rx_cache * cache; 00262 00263 /* Every regex defines the size of its own character set. 00264 * A superstate has an array of this size, with each element 00265 * a `struct rx_inx'. So, don't make this number too large. 00266 * In particular, don't make it 2^16. 00267 */ 00268 int local_cset_size; 00269 00270 /* After the NFA is built, it is copied into a contiguous region 00271 * of memory (mostly for compatability with GNU regex). 00272 * Here is that region, and it's size: 00273 */ 00274 void * buffer; 00275 unsigned long allocated; 00276 00277 /* Clients of RX can ask for some extra storage in the space pointed 00278 * to by BUFFER. The field RESERVED is an input parameter to the 00279 * compiler. After compilation, this much space will be available 00280 * at (buffer + allocated - reserved) 00281 */ 00282 unsigned long reserved; 00283 00284 /* --------- The remaining fields are for internal use only. --------- */ 00285 /* --------- But! they must be initialized to 0. --------- */ 00286 00287 /* NODEC is the number of nodes in the NFA with non-epsilon 00288 * transitions. 00289 */ 00290 int nodec; 00291 00292 /* EPSNODEC is the number of nodes with only epsilon transitions. */ 00293 int epsnodec; 00294 00295 /* The sum (NODEC + EPSNODEC) is the total number of states in the 00296 * compiled NFA. 00297 */ 00298 00299 /* Lists of side effects as stored in the NFA are `hash consed'..meaning 00300 * that lists with the same elements are ==. During compilation, 00301 * this table facilitates hash-consing. 00302 */ 00303 struct rx_hash se_list_memo; 00304 00305 /* Lists of NFA states are also hashed. 00306 */ 00307 struct rx_hash set_list_memo; 00308 00309 00310 00311 00312 /* The compiler and matcher must build a number of instruction frames. 00313 * The format of these frames is fixed (c.f. struct rx_inx). The values 00314 * of the instructions is not fixed. 00315 * 00316 * An enumerated type (enum rx_opcode) defines the set of instructions 00317 * that the compiler or matcher might generate. When filling an instruction 00318 * frame, the INX field is found by indexing this instruction table 00319 * with an opcode: 00320 */ 00321 void ** instruction_table; 00322 00323 /* The list of all states in an NFA. 00324 * During compilation, the NEXT field of NFA states links this list. 00325 * After compilation, all the states are compacted into an array, 00326 * ordered by state id numbers. At that time, this points to the base 00327 * of that array. 00328 */ 00329 struct rx_nfa_state *nfa_states; 00330 00331 /* Every nfa begins with one distinguished starting state: 00332 */ 00333 struct rx_nfa_state *start; 00334 00335 /* This orders the search through super-nfa paths. 00336 * See the comment near the typedef of rx_se_list_order. 00337 */ 00338 rx_se_list_order se_list_cmp; 00339 00340 struct rx_superset * start_set; 00341 }; 00342 00343 00344 00345 00346 /* SYNTAX TREES */ 00347 00348 /* Compilation is in stages. 00349 * 00350 * In the first stage, a pattern specified by a string is 00351 * translated into a syntax tree. Later stages will convert 00352 * the syntax tree into an NFA optimized for conversion to a 00353 * superstate-NFA. 00354 * 00355 * This page is about syntax trees. 00356 */ 00357 00358 enum rexp_node_type 00359 { 00360 r_cset, /* Match from a character set. `a' or `[a-z]'*/ 00361 r_concat, /* Concat two subexpressions. `ab' */ 00362 r_alternate, /* Choose one of two subexpressions. `a\|b' */ 00363 r_opt, /* Optional subexpression. `a?' */ 00364 r_star, /* Repeated subexpression. `a*' */ 00365 00366 00367 /* A 2phase-star is a variation on a repeated subexpression. 00368 * In this case, there are two subexpressions. The first, if matched, 00369 * begins a repitition (otherwise, the whole expression is matches the 00370 * empth string). 00371 * 00372 * After matching the first subexpression, a 2phase star either finishes, 00373 * or matches the second subexpression. If the second subexpression is 00374 * matched, then the whole construct repeats. 00375 * 00376 * 2phase stars are used in two circumstances. First, they 00377 * are used as part of the implementation of POSIX intervals (counted 00378 * repititions). Second, they are used to implement proper star 00379 * semantics when the repeated subexpression contains paths of 00380 * only side effects. See rx_compile for more information. 00381 */ 00382 r_2phase_star, 00383 00384 00385 /* c.f. "typedef void * rx_side_effect" */ 00386 r_side_effect, 00387 00388 /* This is an extension type: It is for transient use in source->source 00389 * transformations (implemented over syntax trees). 00390 */ 00391 r_data 00392 }; 00393 00394 /* A side effect is a matcher-specific action associated with 00395 * transitions in the NFA. The details of side effects are up 00396 * to the matcher. To the compiler and superstate constructors 00397 * side effects are opaque: 00398 */ 00399 00400 typedef void * rx_side_effect; 00401 00402 /* Nodes in a syntax tree are of this type: 00403 */ 00404 struct rexp_node 00405 { 00406 enum rexp_node_type type; 00407 union 00408 { 00409 rx_Bitset cset; 00410 rx_side_effect side_effect; 00411 struct 00412 { 00413 struct rexp_node *left; 00414 struct rexp_node *right; 00415 } pair; 00416 void * data; 00417 } params; 00418 }; 00419 00420 00421 00422 /* NFA 00423 * 00424 * A syntax tree is compiled into an NFA. This page defines the structure 00425 * of that NFA. 00426 */ 00427 00428 struct rx_nfa_state 00429 { 00430 /* These are kept in a list as the NFA is being built. */ 00431 struct rx_nfa_state *next; 00432 00433 /* After the NFA is built, states are given integer id's. 00434 * States whose outgoing transitions are all either epsilon or 00435 * side effect edges are given ids less than 0. Other states 00436 * are given successive non-negative ids starting from 0. 00437 */ 00438 int id; 00439 00440 /* The list of NFA edges that go from this state to some other. */ 00441 struct rx_nfa_edge *edges; 00442 00443 /* If you land in this state, then you implicitly land 00444 * in all other states reachable by only epsilon translations. 00445 * Call the set of maximal paths to such states the epsilon closure 00446 * of this state. 00447 * 00448 * There may be other states that are reachable by a mixture of 00449 * epsilon and side effect edges. Consider the set of maximal paths 00450 * of that sort from this state. Call it the epsilon-side-effect 00451 * closure of the state. 00452 * 00453 * The epsilon closure of the state is a subset of the epsilon-side- 00454 * effect closure. It consists of all the paths that contain 00455 * no side effects -- only epsilon edges. 00456 * 00457 * The paths in the epsilon-side-effect closure can be partitioned 00458 * into equivalance sets. Two paths are equivalant if they have the 00459 * same set of side effects, in the same order. The epsilon-closure 00460 * is one of these equivalance sets. Let's call these equivalance 00461 * sets: observably equivalant path sets. That name is chosen 00462 * because equivalance of two paths means they cause the same side 00463 * effects -- so they lead to the same subsequent observations other 00464 * than that they may wind up in different target states. 00465 * 00466 * The superstate nfa, which is derived from this nfa, is based on 00467 * the observation that all of the paths in an observably equivalant 00468 * path set can be explored at the same time, provided that the 00469 * matcher keeps track not of a single nfa state, but of a set of 00470 * states. In particular, after following all the paths in an 00471 * observably equivalant set, you wind up at a set of target states. 00472 * That set of target states corresponds to one state in the 00473 * superstate NFA. 00474 * 00475 * Staticly, before matching begins, it is convenient to analyze the 00476 * nfa. Each state is labeled with a list of the observably 00477 * equivalant path sets who's union covers all the 00478 * epsilon-side-effect paths beginning in this state. This list is 00479 * called the possible futures of the state. 00480 * 00481 * A trivial example is this NFA: 00482 * s1 00483 * A ---> B 00484 * 00485 * s2 00486 * ---> C 00487 * 00488 * epsilon s1 00489 * ---------> D ------> E 00490 * 00491 * 00492 * In this example, A has two possible futures. 00493 * One invokes the side effect `s1' and contains two paths, 00494 * one ending in state B, the other in state E. 00495 * The other invokes the side effect `s2' and contains only 00496 * one path, landing in state C. 00497 */ 00498 struct rx_possible_future *futures; 00499 00500 00501 /* There are exactly two distinguished states in every NFA: */ 00502 unsigned int is_final:1; 00503 unsigned int is_start:1; 00504 00505 /* These are used during NFA construction... */ 00506 unsigned int eclosure_needed:1; 00507 unsigned int mark:1; 00508 }; 00509 00510 00511 /* An edge in an NFA is typed: */ 00512 enum rx_nfa_etype 00513 { 00514 /* A cset edge is labled with a set of characters one of which 00515 * must be matched for the edge to be taken. 00516 */ 00517 ne_cset, 00518 00519 /* An epsilon edge is taken whenever its starting state is 00520 * reached. 00521 */ 00522 ne_epsilon, 00523 00524 /* A side effect edge is taken whenever its starting state is 00525 * reached. Side effects may cause the match to fail or the 00526 * position of the matcher to advance. 00527 */ 00528 ne_side_effect /* A special kind of epsilon. */ 00529 }; 00530 00531 struct rx_nfa_edge 00532 { 00533 struct rx_nfa_edge *next; 00534 enum rx_nfa_etype type; 00535 struct rx_nfa_state *dest; 00536 union 00537 { 00538 rx_Bitset cset; 00539 rx_side_effect side_effect; 00540 } params; 00541 }; 00542 00543 00544 00545 /* A possible future consists of a list of side effects 00546 * and a set of destination states. Below are their 00547 * representations. These structures are hash-consed which 00548 * means that lists with the same elements share a representation 00549 * (their addresses are ==). 00550 */ 00551 00552 struct rx_nfa_state_set 00553 { 00554 struct rx_nfa_state * car; 00555 struct rx_nfa_state_set * cdr; 00556 }; 00557 00558 struct rx_se_list 00559 { 00560 rx_side_effect car; 00561 struct rx_se_list * cdr; 00562 }; 00563 00564 struct rx_possible_future 00565 { 00566 struct rx_possible_future *next; 00567 struct rx_se_list * effects; 00568 struct rx_nfa_state_set * destset; 00569 }; 00570 00571 00572 00573 /* This begins the description of the superstate NFA. 00574 * 00575 * The superstate NFA corresponds to the NFA in these ways: 00576 * 00577 * Every superstate NFA states SUPER correspond to sets of NFA states, 00578 * nfa_states(SUPER). 00579 * 00580 * Superstate edges correspond to NFA paths. 00581 * 00582 * The superstate has no epsilon transitions; 00583 * every edge has a character label, and a (possibly empty) side 00584 * effect label. The side effect label corresponds to a list of 00585 * side effects that occur in the NFA. These parts are referred 00586 * to as: superedge_character(EDGE) and superedge_sides(EDGE). 00587 * 00588 * For a superstate edge EDGE starting in some superstate SUPER, 00589 * the following is true (in pseudo-notation :-): 00590 * 00591 * exists DEST in nfa_states s.t. 00592 * exists nfaEDGE in nfa_edges s.t. 00593 * origin (nfaEDGE) == DEST 00594 * && origin (nfaEDGE) is a member of nfa_states(SUPER) 00595 * && exists PF in possible_futures(dest(nfaEDGE)) s.t. 00596 * sides_of_possible_future (PF) == superedge_sides (EDGE) 00597 * 00598 * also: 00599 * 00600 * let SUPER2 := superedge_destination(EDGE) 00601 * nfa_states(SUPER2) 00602 * == union of all nfa state sets S s.t. 00603 * exists PF in possible_futures(dest(nfaEDGE)) s.t. 00604 * sides_of_possible_future (PF) == superedge_sides (EDGE) 00605 * && S == dests_of_possible_future (PF) } 00606 * 00607 * Or in english, every superstate is a set of nfa states. A given 00608 * character and a superstate implies many transitions in the NFA -- 00609 * those that begin with an edge labeled with that character from a 00610 * state in the set corresponding to the superstate. 00611 * 00612 * The destinations of those transitions each have a set of possible 00613 * futures. A possible future is a list of side effects and a set of 00614 * destination NFA states. Two sets of possible futures can be 00615 * `merged' by combining all pairs of possible futures that have the 00616 * same side effects. A pair is combined by creating a new future 00617 * with the same side effect but the union of the two destination sets. 00618 * In this way, all the possible futures suggested by a superstate 00619 * and a character can be merged into a set of possible futures where 00620 * no two elements of the set have the same set of side effects. 00621 * 00622 * The destination of a possible future, being a set of NFA states, 00623 * corresponds to a supernfa state. So, the merged set of possible 00624 * futures we just created can serve as a set of edges in the 00625 * supernfa. 00626 * 00627 * The representation of the superstate nfa and the nfa is critical. 00628 * The nfa has to be compact, but has to facilitate the rapid 00629 * computation of missing superstates. The superstate nfa has to 00630 * be fast to interpret, lazilly constructed, and bounded in space. 00631 * 00632 * To facilitate interpretation, the superstate data structures are 00633 * peppered with `instruction frames'. There is an instruction set 00634 * defined below which matchers using the supernfa must be able to 00635 * interpret. 00636 * 00637 * We'd like to make it possible but not mandatory to use code 00638 * addresses to represent instructions (c.f. gcc's computed goto). 00639 * Therefore, we define an enumerated type of opcodes, and when 00640 * writing one of these instructions into a data structure, use 00641 * the opcode as an index into a table of instruction values. 00642 * 00643 * Here are the opcodes that occur in the superstate nfa: 00644 */ 00645 00646 00647 /* Every superstate contains a table of instruction frames indexed 00648 * by characters. A normal `move' in a matcher is to fetch the next 00649 * character and use it as an index into a superstates transition 00650 * table. 00651 * 00652 * In the fasted case, only one edge follows from that character. 00653 * In other cases there is more work to do. 00654 * 00655 * The descriptions of the opcodes refer to data structures that are 00656 * described further below. 00657 */ 00658 00659 enum rx_opcode 00660 { 00661 /* 00662 * BACKTRACK_POINT is invoked when a character transition in 00663 * a superstate leads to more than one edge. In that case, 00664 * the edges have to be explored independently using a backtracking 00665 * strategy. 00666 * 00667 * A BACKTRACK_POINT instruction is stored in a superstate's 00668 * transition table for some character when it is known that that 00669 * character crosses more than one edge. On encountering this 00670 * instruction, the matcher saves enough state to backtrack to this 00671 * point in the match later. 00672 */ 00673 rx_backtrack_point = 0, /* data is (struct transition_class *) */ 00674 00675 /* 00676 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. 00677 * There is one occurence of this instruction per rx_distinct_future. 00678 * This instruction is skipped if a rx_distinct_future has no side effects. 00679 */ 00680 rx_do_side_effects = rx_backtrack_point + 1, 00681 00682 /* data is (struct rx_distinct_future *) */ 00683 00684 /* 00685 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose 00686 * destination superstate has been reclaimed (or was never built). 00687 * It recomputes the destination superstate. 00688 * RX_CACHE_MISS is also stored in a superstate transition table before 00689 * any of its edges have been built. 00690 */ 00691 rx_cache_miss = rx_do_side_effects + 1, 00692 /* data is (struct rx_distinct_future *) */ 00693 00694 /* 00695 * RX_NEXT_CHAR is called to consume the next character and take the 00696 * corresponding transition. This is the only instruction that uses 00697 * the DATA field of the instruction frame instead of DATA_2. 00698 * (see EXPLORE_FUTURE in regex.c). 00699 */ 00700 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ 00701 00702 /* RX_BACKTRACK indicates that a transition fails. 00703 */ 00704 rx_backtrack = rx_next_char + 1, /* no data */ 00705 00706 /* 00707 * RX_ERROR_INX is stored only in places that should never be executed. 00708 */ 00709 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ 00710 00711 rx_num_instructions = rx_error_inx + 1 00712 }; 00713 00714 /* An id_instruction_table holds the values stored in instruction 00715 * frames. The table is indexed by the enums declared above. 00716 */ 00717 extern void * rx_id_instruction_table[rx_num_instructions]; 00718 00719 /* The heart of the matcher is a `word-code-interpreter' 00720 * (like a byte-code interpreter, except that instructions 00721 * are a full word wide). 00722 * 00723 * Instructions are not stored in a vector of code, instead, 00724 * they are scattered throughout the data structures built 00725 * by the regexp compiler and the matcher. One word-code instruction, 00726 * together with the arguments to that instruction, constitute 00727 * an instruction frame (struct rx_inx). 00728 * 00729 * This structure type is padded by hand to a power of 2 because 00730 * in one of the dominant cases, we dispatch by indexing a table 00731 * of instruction frames. If that indexing can be accomplished 00732 * by just a shift of the index, we're happy. 00733 * 00734 * Instructions take at most one argument, but there are two 00735 * slots in an instruction frame that might hold that argument. 00736 * These are called data and data_2. The data slot is only 00737 * used for one instruction (RX_NEXT_CHAR). For all other 00738 * instructions, data should be set to 0. 00739 * 00740 * RX_NEXT_CHAR is the most important instruction by far. 00741 * By reserving the data field for its exclusive use, 00742 * instruction dispatch is sped up in that case. There is 00743 * no need to fetch both the instruction and the data, 00744 * only the data is needed. In other words, a `cycle' begins 00745 * by fetching the field data. If that is non-0, then it must 00746 * be the destination state of a next_char transition, so 00747 * make that value the current state, advance the match position 00748 * by one character, and start a new cycle. On the other hand, 00749 * if data is 0, fetch the instruction and do a more complicated 00750 * dispatch on that. 00751 */ 00752 00753 struct rx_inx 00754 { 00755 void * data; 00756 void * data_2; 00757 void * inx; 00758 void * fnord; 00759 }; 00760 00761 #ifndef RX_TAIL_ARRAY 00762 #define RX_TAIL_ARRAY 1 00763 #endif 00764 00765 /* A superstate corresponds to a set of nfa states. Those sets are 00766 * represented by STRUCT RX_SUPERSET. The constructors 00767 * guarantee that only one (shared) structure is created for a given set. 00768 */ 00769 struct rx_superset 00770 { 00771 int refs; /* This is a reference counted structure. */ 00772 00773 /* We keep these sets in a cache because (in an unpredictable way), 00774 * the same set is often created again and again. But that is also 00775 * problematic -- compatibility with POSIX and GNU regex requires 00776 * that we not be able to tell when a program discards a particular 00777 * NFA (thus invalidating the supersets created from it). 00778 * 00779 * But when a cache hit appears to occur, we will have in hand the 00780 * nfa for which it may have happened. That is why every nfa is given 00781 * its own sequence number. On a cache hit, the cache is validated 00782 * by comparing the nfa sequence number to this field: 00783 */ 00784 int id; 00785 00786 struct rx_nfa_state * car; /* May or may not be a valid addr. */ 00787 struct rx_superset * cdr; 00788 00789 /* If the corresponding superstate exists: */ 00790 struct rx_superstate * superstate; 00791 00792 00793 /* There is another bookkeeping problem. It is expensive to 00794 * compute the starting nfa state set for an nfa. So, once computed, 00795 * it is cached in the `struct rx'. 00796 * 00797 * But, the state set can be flushed from the superstate cache. 00798 * When that happens, we can't know if the corresponding `struct rx' 00799 * is still alive or if it has been freed or re-used by the program. 00800 * So, the cached pointer to this set in a struct rx might be invalid 00801 * and we need a way to validate it. 00802 * 00803 * Fortunately, even if this set is flushed from the cache, it is 00804 * not freed. It just goes on the free-list of supersets. 00805 * So we can still examine it. 00806 * 00807 * So to validate a starting set memo, check to see if the 00808 * starts_for field still points back to the struct rx in question, 00809 * and if the ID matches the rx sequence number. 00810 */ 00811 struct rx * starts_for; 00812 00813 /* This is used to link into a hash bucket so these objects can 00814 * be `hash-consed'. 00815 */ 00816 struct rx_hash_item hash_item; 00817 }; 00818 00819 #define rx_protect_superset(RX,CON) (++(CON)->refs) 00820 00821 /* The terminology may be confusing (rename this structure?). 00822 * Every character occurs in at most one rx_super_edge per super-state. 00823 * But, that structure might have more than one option, indicating a point 00824 * of non-determinism. 00825 * 00826 * In other words, this structure holds a list of superstate edges 00827 * sharing a common starting state and character label. The edges 00828 * are in the field OPTIONS. All superstate edges sharing the same 00829 * starting state and character are in this list. 00830 */ 00831 struct rx_super_edge 00832 { 00833 struct rx_super_edge *next; 00834 struct rx_inx rx_backtrack_frame; 00835 int cset_size; 00836 rx_Bitset cset; 00837 struct rx_distinct_future *options; 00838 }; 00839 00840 /* A superstate is a set of nfa states (RX_SUPERSET) along 00841 * with a transition table. Superstates are built on demand and reclaimed 00842 * without warning. To protect a superstate from this ghastly fate, 00843 * use LOCK_SUPERSTATE. 00844 */ 00845 struct rx_superstate 00846 { 00847 int rx_id; /* c.f. the id field of rx_superset */ 00848 int locks; /* protection from reclamation */ 00849 00850 /* Within a superstate cache, all the superstates are kept in a big 00851 * queue. The tail of the queue is the state most likely to be 00852 * reclaimed. The *recyclable fields hold the queue position of 00853 * this state. 00854 */ 00855 struct rx_superstate * next_recyclable; 00856 struct rx_superstate * prev_recyclable; 00857 00858 /* The supernfa edges that exist in the cache and that have 00859 * this state as their destination are kept in this list: 00860 */ 00861 struct rx_distinct_future * transition_refs; 00862 00863 /* The list of nfa states corresponding to this superstate: */ 00864 struct rx_superset * contents; 00865 00866 /* The list of edges in the cache beginning from this state. */ 00867 struct rx_super_edge * edges; 00868 00869 /* A tail of the recyclable queue is marked as semifree. A semifree 00870 * state has no incoming next_char transitions -- any transition 00871 * into a semifree state causes a complex dispatch with the side 00872 * effect of rescuing the state from its semifree state. 00873 * 00874 * An alternative to this might be to make next_char more expensive, 00875 * and to move a state to the head of the recyclable queue whenever 00876 * it is entered. That way, popular states would never be recycled. 00877 * 00878 * But unilaterally making next_char more expensive actually loses. 00879 * So, incoming transitions are only made expensive for states near 00880 * the tail of the recyclable queue. The more cache contention 00881 * there is, the more frequently a state will have to prove itself 00882 * and be moved back to the front of the queue. If there is less 00883 * contention, then popular states just aggregate in the front of 00884 * the queue and stay there. 00885 */ 00886 int is_semifree; 00887 00888 00889 /* This keeps track of the size of the transition table for this 00890 * state. There is a half-hearted attempt to support variable sized 00891 * superstates. 00892 */ 00893 int trans_size; 00894 00895 /* Indexed by characters... */ 00896 struct rx_inx transitions[RX_TAIL_ARRAY]; 00897 }; 00898 00899 00900 /* A list of distinct futures define the edges that leave from a 00901 * given superstate on a given character. c.f. rx_super_edge. 00902 */ 00903 00904 struct rx_distinct_future 00905 { 00906 struct rx_distinct_future * next_same_super_edge[2]; 00907 struct rx_distinct_future * next_same_dest; 00908 struct rx_distinct_future * prev_same_dest; 00909 struct rx_superstate * present; /* source state */ 00910 struct rx_superstate * future; /* destination state */ 00911 struct rx_super_edge * edge; 00912 00913 00914 /* The future_frame holds the instruction that should be executed 00915 * after all the side effects are done, when it is time to complete 00916 * the transition to the next state. 00917 * 00918 * Normally this is a next_char instruction, but it may be a 00919 * cache_miss instruction as well, depending on whether or not 00920 * the superstate is in the cache and semifree. 00921 * 00922 * If this is the only future for a given superstate/char, and 00923 * if there are no side effects to be performed, this frame is 00924 * not used (directly) at all. Instead, its contents are copied 00925 * into the transition table of the starting state of this dist. future. 00926 */ 00927 struct rx_inx future_frame; 00928 00929 struct rx_inx side_effects_frame; 00930 struct rx_se_list * effects; 00931 }; 00932 00933 #define rx_lock_superstate(R,S) ((S)->locks++) 00934 #define rx_unlock_superstate(R,S) (--(S)->locks) 00935 00936 00937 /* This page destined for rx.h */ 00938 00939 struct rx_blocklist 00940 { 00941 struct rx_blocklist * next; 00942 int bytes; 00943 }; 00944 00945 struct rx_freelist 00946 { 00947 struct rx_freelist * next; 00948 }; 00949 00950 struct rx_cache; 00951 00952 #ifdef __STDC__ 00953 typedef void (*rx_morecore_fn)(struct rx_cache *); 00954 #else 00955 typedef void (*rx_morecore_fn)(); 00956 #endif 00957 00958 /* You use this to control the allocation of superstate data 00959 * during matching. Most of it should be initialized to 0. 00960 * 00961 * A MORECORE function is necessary. It should allocate 00962 * a new block of memory or return 0. 00963 * A default that uses malloc is called `rx_morecore'. 00964 * 00965 * The number of SUPERSTATES_ALLOWED indirectly limits how much memory 00966 * the system will try to allocate. The default is 128. Batch style 00967 * applications that are very regexp intensive should use as high a number 00968 * as possible without thrashing. 00969 * 00970 * The LOCAL_CSET_SIZE is the number of characters in a character set. 00971 * It is therefore the number of entries in a superstate transition table. 00972 * Generally, it should be 256. If your character set has 16 bits, 00973 * it is better to translate your regexps into equivalent 8 bit patterns. 00974 */ 00975 00976 struct rx_cache 00977 { 00978 struct rx_hash_rules superset_hash_rules; 00979 00980 /* Objects are allocated by incrementing a pointer that 00981 * scans across rx_blocklists. 00982 */ 00983 struct rx_blocklist * memory; 00984 struct rx_blocklist * memory_pos; 00985 int bytes_left; 00986 char * memory_addr; 00987 rx_morecore_fn morecore; 00988 00989 /* Freelists. */ 00990 struct rx_freelist * free_superstates; 00991 struct rx_freelist * free_transition_classes; 00992 struct rx_freelist * free_discernable_futures; 00993 struct rx_freelist * free_supersets; 00994 struct rx_freelist * free_hash; 00995 00996 /* Two sets of superstates -- those that are semifreed, and those 00997 * that are being used. 00998 */ 00999 struct rx_superstate * lru_superstate; 01000 struct rx_superstate * semifree_superstate; 01001 01002 struct rx_superset * empty_superset; 01003 01004 int superstates; 01005 int semifree_superstates; 01006 int hits; 01007 int misses; 01008 int superstates_allowed; 01009 01010 int local_cset_size; 01011 void ** instruction_table; 01012 01013 struct rx_hash superset_table; 01014 }; 01015 01016 01017 01018 /* The lowest-level search function supports arbitrarily fragmented 01019 * strings and (optionally) suspendable/resumable searches. 01020 * 01021 * Callers have to provide a few hooks. 01022 */ 01023 01024 #ifndef __GNUC__ 01025 #ifdef __STDC__ 01026 #define __const__ const 01027 #else 01028 #define __const__ 01029 #endif 01030 #endif 01031 01032 /* This holds a matcher position */ 01033 struct rx_string_position 01034 { 01035 __const__ unsigned char * pos; /* The current pos. */ 01036 __const__ unsigned char * string; /* The current string burst. */ 01037 __const__ unsigned char * end; /* First invalid position >= POS. */ 01038 int offset; /* Integer address of the current burst. */ 01039 int size; /* Current string's size. */ 01040 int search_direction; /* 1 or -1 */ 01041 int search_end; /* First position to not try. */ 01042 }; 01043 01044 01045 enum rx_get_burst_return 01046 { 01047 rx_get_burst_continuation, 01048 rx_get_burst_error, 01049 rx_get_burst_ok, 01050 rx_get_burst_no_more 01051 }; 01052 01053 01054 /* A call to get burst should make POS valid. It might be invalid 01055 * if the STRING field doesn't point to a burst that actually 01056 * contains POS. 01057 * 01058 * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to 01059 * whether or not to pad to the left. Padding to the right is always 01060 * appropriate, but need not go past the point indicated by STOP. 01061 * 01062 * If a continuation is returned, then the reentering call to 01063 * a search function will retry the get_burst. 01064 */ 01065 01066 #ifdef __STDC__ 01067 typedef enum rx_get_burst_return 01068 (*rx_get_burst_fn) (struct rx_string_position * pos, 01069 void * app_closure, 01070 int stop); 01071 01072 #else 01073 typedef enum rx_get_burst_return (*rx_get_burst_fn) (); 01074 #endif 01075 01076 01077 enum rx_back_check_return 01078 { 01079 rx_back_check_continuation, 01080 rx_back_check_error, 01081 rx_back_check_pass, 01082 rx_back_check_fail 01083 }; 01084 01085 /* Back_check should advance the position it is passed 01086 * over rparen - lparen characters and return pass iff 01087 * the characters starting at POS match those indexed 01088 * by [LPAREN..RPAREN]. 01089 * 01090 * If a continuation is returned, then the reentering call to 01091 * a search function will retry the back_check. 01092 */ 01093 01094 #ifdef __STDC__ 01095 typedef enum rx_back_check_return 01096 (*rx_back_check_fn) (struct rx_string_position * pos, 01097 int lparen, 01098 int rparen, 01099 unsigned char * translate, 01100 void * app_closure, 01101 int stop); 01102 01103 #else 01104 typedef enum rx_back_check_return (*rx_back_check_fn) (); 01105 #endif 01106 01107 01108 01109 01110 /* A call to fetch_char should return the character at POS or POS + 1. 01111 * Returning continuations here isn't supported. OFFSET is either 0 or 1 01112 * and indicates which characters is desired. 01113 */ 01114 01115 #ifdef __STDC__ 01116 typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos, 01117 int offset, 01118 void * app_closure, 01119 int stop); 01120 #else 01121 typedef int (*rx_fetch_char_fn) (); 01122 #endif 01123 01124 01125 enum rx_search_return 01126 { 01127 rx_search_continuation = -4, 01128 rx_search_error = -3, 01129 rx_search_soft_fail = -2, /* failed by running out of string */ 01130 rx_search_fail = -1 /* failed only by reaching failure states */ 01131 /* return values >= 0 indicate the position of a successful match */ 01132 }; 01133 01134 01135 01136 01137 01138 01139 /* regex.h 01140 * 01141 * The remaining declarations replace regex.h. 01142 */ 01143 01144 /* This is an array of error messages corresponding to the error codes. 01145 */ 01146 extern __const__ char *re_error_msg[]; 01147 01148 /* If any error codes are removed, changed, or added, update the 01149 `re_error_msg' table in regex.c. */ 01150 typedef enum 01151 { 01152 REG_NOERROR = 0, /* Success. */ 01153 REG_NOMATCH, /* Didn't find a match (for regexec). */ 01154 01155 /* POSIX regcomp return error codes. (In the order listed in the 01156 standard.) */ 01157 REG_BADPAT, /* Invalid pattern. */ 01158 REG_ECOLLATE, /* Not implemented. */ 01159 REG_ECTYPE, /* Invalid character class name. */ 01160 REG_EESCAPE, /* Trailing backslash. */ 01161 REG_ESUBREG, /* Invalid back reference. */ 01162 REG_EBRACK, /* Unmatched left bracket. */ 01163 REG_EPAREN, /* Parenthesis imbalance. */ 01164 REG_EBRACE, /* Unmatched \{. */ 01165 REG_BADBR, /* Invalid contents of \{\}. */ 01166 REG_ERANGE, /* Invalid range end. */ 01167 REG_ESPACE, /* Ran out of memory. */ 01168 REG_BADRPT, /* No preceding re for repetition op. */ 01169 01170 /* Error codes we've added. */ 01171 REG_EEND, /* Premature end. */ 01172 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ 01173 REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ 01174 } reg_errcode_t; 01175 01176 /* The regex.c support, as a client of rx, defines a set of possible 01177 * side effects that can be added to the edge lables of nfa edges. 01178 * Here is the list of sidef effects in use. 01179 */ 01180 01181 enum re_side_effects 01182 { 01183 #define RX_WANT_SE_DEFS 1 01184 #undef RX_DEF_SE 01185 #undef RX_DEF_CPLX_SE 01186 #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE, 01187 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE, 01188 #include "MNrx.h" 01189 #undef RX_DEF_SE 01190 #undef RX_DEF_CPLX_SE 01191 #undef RX_WANT_SE_DEFS 01192 re_floogle_flap = 65533 01193 }; 01194 01195 /* These hold paramaters for the kinds of side effects that are possible 01196 * in the supported pattern languages. These include things like the 01197 * numeric bounds of {} operators and the index of paren registers for 01198 * subexpression measurement or backreferencing. 01199 */ 01200 struct re_se_params 01201 { 01202 enum re_side_effects se; 01203 int op1; 01204 int op2; 01205 }; 01206 01207 typedef unsigned reg_syntax_t; 01208 01209 struct re_pattern_buffer 01210 { 01211 struct rx rx; 01212 reg_syntax_t syntax; /* See below for syntax bit definitions. */ 01213 01214 unsigned int no_sub:1; /* If set, don't return register offsets. */ 01215 unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */ 01216 unsigned int not_eol:1; /* match at the ends of the string. */ 01217 unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/ 01218 unsigned int least_subs:1; /* If set, and returning registers, return 01219 * as few values as possible. Only 01220 * backreferenced groups and group 0 (the whole 01221 * match) will be returned. 01222 */ 01223 01224 /* If true, this says that the matcher should keep registers on its 01225 * backtracking stack. For many patterns, we can easily determine that 01226 * this isn't necessary. 01227 */ 01228 unsigned int match_regs_on_stack:1; 01229 unsigned int search_regs_on_stack:1; 01230 01231 /* is_anchored and begbuf_only are filled in by rx_compile. */ 01232 unsigned int is_anchored:1; /* Anchorded by ^? */ 01233 unsigned int begbuf_only:1; /* Anchored to char position 0? */ 01234 01235 01236 /* If REGS_UNALLOCATED, allocate space in the `regs' structure 01237 * for `max (RE_NREGS, re_nsub + 1)' groups. 01238 * If REGS_REALLOCATE, reallocate space if necessary. 01239 * If REGS_FIXED, use what's there. 01240 */ 01241 #define REGS_UNALLOCATED 0 01242 #define REGS_REALLOCATE 1 01243 #define REGS_FIXED 2 01244 unsigned int regs_allocated:2; 01245 01246 01247 /* Either a translate table to apply to all characters before 01248 * comparing them, or zero for no translation. The translation 01249 * is applied to a pattern when it is compiled and to a string 01250 * when it is matched. 01251 */ 01252 unsigned char * translate; 01253 01254 /* If this is a valid pointer, it tells rx not to store the extents of 01255 * certain subexpressions (those corresponding to non-zero entries). 01256 * Passing 0x1 is the same as passing an array of all ones. Passing 0x0 01257 * is the same as passing an array of all zeros. 01258 * The array should contain as many entries as their are subexps in the 01259 * regexp. 01260 * 01261 * For POSIX compatability, when using regcomp and regexec this field 01262 * is zeroed and ignored. 01263 */ 01264 char * syntax_parens; 01265 01266 /* Number of subexpressions found by the compiler. */ 01267 size_t re_nsub; 01268 01269 void * buffer; /* Malloced memory for the nfa. */ 01270 unsigned long allocated; /* Size of that memory. */ 01271 01272 /* Pointer to a fastmap, if any, otherwise zero. re_search uses 01273 * the fastmap, if there is one, to skip over impossible 01274 * starting points for matches. */ 01275 char *fastmap; 01276 01277 unsigned int fastmap_accurate:1; /* These three are internal. */ 01278 unsigned int can_match_empty:1; 01279 struct rx_nfa_state * start; /* The nfa starting state. */ 01280 01281 /* This is the list of iterator bounds for {lo,hi} constructs. 01282 * The memory pointed to is part of the rx->buffer. 01283 */ 01284 struct re_se_params *se_params; 01285 01286 /* This is a bitset representation of the fastmap. 01287 * This is a true fastmap that already takes the translate 01288 * table into account. 01289 */ 01290 rx_Bitset fastset; 01291 }; 01292 01293 /* Type for byte offsets within the string. POSIX mandates this. */ 01294 typedef int regoff_t; 01295 01296 /* This is the structure we store register match data in. See 01297 regex.texinfo for a full description of what registers match. */ 01298 struct re_registers 01299 { 01300 unsigned num_regs; 01301 regoff_t *start; 01302 regoff_t *end; 01303 }; 01304 01305 typedef struct re_pattern_buffer regex_t; 01306 01307 /* POSIX specification for registers. Aside from the different names than 01308 `re_registers', POSIX uses an array of structures, instead of a 01309 structure of arrays. */ 01310 typedef struct 01311 { 01312 regoff_t rm_so; /* Byte offset from string's start to substring's start. */ 01313 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ 01314 } regmatch_t; 01315 01316 01317 /* The following bits are used to determine the regexp syntax we 01318 recognize. The set/not-set meanings are chosen so that Emacs syntax 01319 remains the value 0. The bits are given in alphabetical order, and 01320 the definitions shifted by one from the previous bit; thus, when we 01321 add or remove a bit, only one other definition need change. */ 01322 01323 /* If this bit is not set, then \ inside a bracket expression is literal. 01324 If set, then such a \ quotes the following character. */ 01325 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) 01326 01327 /* If this bit is not set, then + and ? are operators, and \+ and \? are 01328 literals. 01329 If set, then \+ and \? are operators and + and ? are literals. */ 01330 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) 01331 01332 /* If this bit is set, then character classes are supported. They are: 01333 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 01334 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. 01335 If not set, then character classes are not supported. */ 01336 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) 01337 01338 /* If this bit is set, then ^ and $ are always anchors (outside bracket 01339 expressions, of course). 01340 If this bit is not set, then it depends: 01341 ^ is an anchor if it is at the beginning of a regular 01342 expression or after an open-group or an alternation operator; 01343 $ is an anchor if it is at the end of a regular expression, or 01344 before a close-group or an alternation operator. 01345 01346 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because 01347 POSIX draft 11.2 says that * etc. in leading positions is undefined. 01348 We already implemented a previous draft which made those constructs 01349 invalid, though, so we haven't changed the code back. */ 01350 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) 01351 01352 /* If this bit is set, then special characters are always special 01353 regardless of where they are in the pattern. 01354 If this bit is not set, then special characters are special only in 01355 some contexts; otherwise they are ordinary. Specifically, 01356 * + ? and intervals are only special when not after the beginning, 01357 open-group, or alternation operator. */ 01358 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) 01359 01360 /* If this bit is set, then *, +, ?, and { cannot be first in an re or 01361 immediately after an alternation or begin-group operator. */ 01362 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) 01363 01364 /* If this bit is set, then . matches newline. 01365 If not set, then it doesn't. */ 01366 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) 01367 01368 /* If this bit is set, then . doesn't match NUL. 01369 If not set, then it does. */ 01370 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) 01371 01372 /* If this bit is set, nonmatching lists [^...] do not match newline. 01373 If not set, they do. */ 01374 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) 01375 01376 /* If this bit is set, either \{...\} or {...} defines an 01377 interval, depending on RE_NO_BK_BRACES. 01378 If not set, \{, \}, {, and } are literals. */ 01379 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 01380 01381 /* If this bit is set, +, ? and | aren't recognized as operators. 01382 If not set, they are. */ 01383 #define RE_LIMITED_OPS (RE_INTERVALS << 1) 01384 01385 /* If this bit is set, newline is an alternation operator. 01386 If not set, newline is literal. */ 01387 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) 01388 01389 /* If this bit is set, then `{...}' defines an interval, and \{ and \} 01390 are literals. 01391 If not set, then `\{...\}' defines an interval. */ 01392 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) 01393 01394 /* If this bit is set, (...) defines a group, and \( and \) are literals. 01395 If not set, \(...\) defines a group, and ( and ) are literals. */ 01396 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) 01397 01398 /* If this bit is set, then <digit> matches <digit>. 01399 If not set, then <digit> is a back-reference. */ 01400 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) 01401 01402 /* If this bit is set, then | is an alternation operator, and \| is literal. 01403 If not set, then \| is an alternation operator, and | is literal. */ 01404 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) 01405 01406 /* If this bit is set, then an ending range point collating higher 01407 than the starting range point, as in [z-a], is invalid. 01408 If not set, then when ending range point collates higher than the 01409 starting range point, the range is ignored. */ 01410 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) 01411 01412 /* If this bit is set, then an unmatched ) is ordinary. 01413 If not set, then an unmatched ) is invalid. */ 01414 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) 01415 01416 /* This global variable defines the particular regexp syntax to use (for 01417 some interfaces). When a regexp is compiled, the syntax used is 01418 stored in the pattern buffer, so changing this does not affect 01419 already-compiled regexps.