Main Page | Modules | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

MNrx.h

Go to the documentation of this file.
00001 #if !defined(RXH) || defined(RX_WANT_SE_DEFS) 00002 #define RXH 00003 00004 /* Copyright (C) 1992, 1993 Free Software Foundation, Inc. 00005 00006 This file is part of the librx library. 00007 00008 Librx is free software; you can redistribute it and/or modify it under 00009 the terms of the GNU Library General Public License as published by 00010 the Free Software Foundation; either version 2, or (at your option) 00011 any later version. 00012 00013 Librx is distributed in the hope that it will be useful, but WITHOUT 00014 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 00015 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 00016 for more details. 00017 00018 You should have received a copy of the GNU Library General Public 00019 License along with this software; see the file COPYING.LIB. If not, 00020 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 00021 02139, USA. */ 00022 /* t. lord Wed Sep 23 18:20:57 1992 */ 00023 00024 00025 00026 00027 00028 00034 00035 00036 00037 #ifndef RX_WANT_SE_DEFS 00038 00039 /* This page: Bitsets */ 00040 00041 #ifndef RX_subset 00042 typedef unsigned int RX_subset; 00043 #define RX_subset_bits (32) 00044 #define RX_subset_mask (RX_subset_bits - 1) 00045 #endif 00046 00047 typedef RX_subset * rx_Bitset; 00048 00049 #ifdef __STDC__ 00050 typedef void (*rx_bitset_iterator) (void *, int member_index); 00051 #else 00052 typedef void (*rx_bitset_iterator) (); 00053 #endif 00054 00055 #define rx_bitset_subset(N) ((N) / RX_subset_bits) 00056 #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) 00057 #define RX_bitset_access(B,N,OP) \ 00058 ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) 00059 #define RX_bitset_member(B,N) RX_bitset_access(B, N, &) 00060 #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) 00061 #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) 00062 #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) 00063 #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) 00064 #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) 00065 00066 00067 00068 /* This page: Splay trees. */ 00069 00070 #ifdef __STDC__ 00071 typedef int (*rx_sp_comparer) (void * a, void * b); 00072 #else 00073 typedef int (*rx_sp_comparer) (); 00074 #endif 00075 00076 struct rx_sp_node 00077 { 00078 void * key; 00079 void * data; 00080 struct rx_sp_node * kids[2]; 00081 }; 00082 00083 #ifdef __STDC__ 00084 typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); 00085 #else 00086 typedef void (*rx_sp_key_data_freer) (); 00087 #endif 00088 00089 00090 /* giant inflatable hash trees */ 00091 00092 struct rx_hash_item 00093 { 00094 struct rx_hash_item * next_same_hash; 00095 struct rx_hash * table; 00096 unsigned long hash; 00097 void * data; 00098 void * binding; 00099 }; 00100 00101 struct rx_hash 00102 { 00103 struct rx_hash * parent; 00104 int refs; 00105 struct rx_hash * children[13]; 00106 struct rx_hash_item * buckets [13]; 00107 int bucket_size [13]; 00108 }; 00109 00110 struct rx_hash_rules; 00111 00112 #ifdef __STDC__ 00113 /* should return like == */ 00114 typedef int (*rx_hash_eq)(void *, void *); 00115 typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); 00116 typedef void (*rx_free_hash)(struct rx_hash *, 00117 struct rx_hash_rules *); 00118 typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, 00119 void *); 00120 typedef void (*rx_free_hash_item)(struct rx_hash_item *, 00121 struct rx_hash_rules *); 00122 #else 00123 typedef int (*rx_hash_eq)(); 00124 typedef struct rx_hash * (*rx_alloc_hash)(); 00125 typedef void (*rx_free_hash)(); 00126 typedef struct rx_hash_item * (*rx_alloc_hash_item)(); 00127 typedef void (*rx_free_hash_item)(); 00128 #endif 00129 00130 struct rx_hash_rules 00131 { 00132 rx_hash_eq eq; 00133 rx_alloc_hash hash_alloc; 00134 rx_free_hash free_hash; 00135 rx_alloc_hash_item hash_item_alloc; 00136 rx_free_hash_item free_hash_item; 00137 }; 00138 00139 00140 /* Forward declarations */ 00141 00142 struct rx_cache; 00143 struct rx_superset; 00144 struct rx; 00145 struct rx_se_list; 00146 00147 00148 00149 /* 00150 * GLOSSARY 00151 * 00152 * regexp 00153 * regular expression 00154 * expression 00155 * pattern - a `regular' expression. The expression 00156 * need not be formally regular -- it can contain 00157 * constructs that don't correspond to purely regular 00158 * expressions. 00159 * 00160 * buffer 00161 * string - the string (or strings) being searched or matched. 00162 * 00163 * pattern buffer - a structure of type `struct re_pattern_buffer' 00164 * This in turn contains a `struct rx', which holds the 00165 * NFA compiled from a pattern, as well as some of the state 00166 * of a matcher using the pattern. 00167 * 00168 * NFA - nondeterministic finite automata. Some people 00169 * use this term to a member of the class of 00170 * regular automata (those corresponding to a regular 00171 * language). However, in this code, the meaning is 00172 * more general. The automata used by Rx are comperable 00173 * in power to what are usually called `push down automata'. 00174 * 00175 * Two NFA are built by rx for every pattern. One is built 00176 * by the compiler. The other is built from the first, on 00177 * the fly, by the matcher. The latter is called the `superstate 00178 * NFA' because its states correspond to sets of states from 00179 * the first NFA. (Joe Keane gets credit for the name 00180 * `superstate NFA'). 00181 * 00182 * NFA edges 00183 * epsilon edges 00184 * side-effect edges - The NFA compiled from a pattern can have three 00185 * kinds of edges. Epsilon edges can be taken freely anytime 00186 * their source state is reached. Character set edges can be 00187 * taken when their source state is reached and when the next 00188 * character in the buffer is a member of the set. Side effect 00189 * edges imply a transition that can only be taken after the 00190 * indicated side effect has been successfully accomplished. 00191 * Some examples of side effects are: 00192 * 00193 * Storing the current match position to record the 00194 * location of a parentesized subexpression. 00195 * 00196 * Advancing the matcher over N characters if they 00197 * match the N characters previously matched by a 00198 * parentesized subexpression. 00199 * 00200 * Both of those kinds of edges occur in the NFA generated 00201 * by the pattern: \(.\)\1 00202 * 00203 * Epsilon and side effect edges are similar. Unfortunately, 00204 * some of the code uses the name `epsilon edge' to mean 00205 * both epsilon and side effect edges. For example, the 00206 * function has_non_idempotent_epsilon_path computes the existance 00207 * of a non-trivial path containing only a mix of epsilon and 00208 * side effect edges. In that case `nonidempotent epsilon' is being 00209 * used to mean `side effect'. 00210 */ 00211 00212 00213 00214 00215 00216 /* LOW LEVEL PATTERN BUFFERS */ 00217 00218 /* Suppose that from some NFA state, more than one path through 00219 * side-effect edges is possible. In what order should the paths 00220 * be tried? A function of type rx_se_list_order answers that 00221 * question. It compares two lists of side effects, and says 00222 * which list comes first. 00223 */ 00224 00225 #ifdef __STDC__ 00226 typedef int (*rx_se_list_order) (struct rx *, 00227 struct rx_se_list *, 00228 struct rx_se_list *); 00229 #else 00230 typedef int (*rx_se_list_order) (); 00231 #endif 00232 00233 00234 00235 /* Struct RX holds a compiled regular expression - that is, an nfa 00236 * ready to be converted on demand to a more efficient superstate nfa. 00237 * This is for the low level interface. The high-level interfaces enclose 00238 * this in a `struct re_pattern_buffer'. 00239 */ 00240 struct rx 00241 { 00242 /* The compiler assigns a unique id to every pattern. 00243 * Like sequence numbers in X, there is a subtle bug here 00244 * if you use Rx in a system that runs for a long time. 00245 * But, because of the way the caches work out, it is almost 00246 * impossible to trigger the Rx version of this bug. 00247 * 00248 * The id is used to validate superstates found in a cache 00249 * of superstates. It isn't sufficient to let a superstate 00250 * point back to the rx for which it was compiled -- the caller 00251 * may be re-using a `struct rx' in which case the superstate 00252 * is not really valid. So instead, superstates are validated 00253 * by checking the sequence number of the pattern for which 00254 * they were built. 00255 */ 00256 int rx_id; 00257 00258 /* This is memory mgt. state for superstates. This may be 00259 * shared by more than one struct rx. 00260 */ 00261 struct rx_cache * cache; 00262 00263 /* Every regex defines the size of its own character set. 00264 * A superstate has an array of this size, with each element 00265 * a `struct rx_inx'. So, don't make this number too large. 00266 * In particular, don't make it 2^16. 00267 */ 00268 int local_cset_size; 00269 00270 /* After the NFA is built, it is copied into a contiguous region 00271 * of memory (mostly for compatability with GNU regex). 00272 * Here is that region, and it's size: 00273 */ 00274 void * buffer; 00275 unsigned long allocated; 00276 00277 /* Clients of RX can ask for some extra storage in the space pointed 00278 * to by BUFFER. The field RESERVED is an input parameter to the 00279 * compiler. After compilation, this much space will be available 00280 * at (buffer + allocated - reserved) 00281 */ 00282 unsigned long reserved; 00283 00284 /* --------- The remaining fields are for internal use only. --------- */ 00285 /* --------- But! they must be initialized to 0. --------- */ 00286 00287 /* NODEC is the number of nodes in the NFA with non-epsilon 00288 * transitions. 00289 */ 00290 int nodec; 00291 00292 /* EPSNODEC is the number of nodes with only epsilon transitions. */ 00293 int epsnodec; 00294 00295 /* The sum (NODEC + EPSNODEC) is the total number of states in the 00296 * compiled NFA. 00297 */ 00298 00299 /* Lists of side effects as stored in the NFA are `hash consed'..meaning 00300 * that lists with the same elements are ==. During compilation, 00301 * this table facilitates hash-consing. 00302 */ 00303 struct rx_hash se_list_memo; 00304 00305 /* Lists of NFA states are also hashed. 00306 */ 00307 struct rx_hash set_list_memo; 00308 00309 00310 00311 00312 /* The compiler and matcher must build a number of instruction frames. 00313 * The format of these frames is fixed (c.f. struct rx_inx). The values 00314 * of the instructions is not fixed. 00315 * 00316 * An enumerated type (enum rx_opcode) defines the set of instructions 00317 * that the compiler or matcher might generate. When filling an instruction 00318 * frame, the INX field is found by indexing this instruction table 00319 * with an opcode: 00320 */ 00321 void ** instruction_table; 00322 00323 /* The list of all states in an NFA. 00324 * During compilation, the NEXT field of NFA states links this list. 00325 * After compilation, all the states are compacted into an array, 00326 * ordered by state id numbers. At that time, this points to the base 00327 * of that array. 00328 */ 00329 struct rx_nfa_state *nfa_states; 00330 00331 /* Every nfa begins with one distinguished starting state: 00332 */ 00333 struct rx_nfa_state *start; 00334 00335 /* This orders the search through super-nfa paths. 00336 * See the comment near the typedef of rx_se_list_order. 00337 */ 00338 rx_se_list_order se_list_cmp; 00339 00340 struct rx_superset * start_set; 00341 }; 00342 00343 00344 00345 00346 /* SYNTAX TREES */ 00347 00348 /* Compilation is in stages. 00349 * 00350 * In the first stage, a pattern specified by a string is 00351 * translated into a syntax tree. Later stages will convert 00352 * the syntax tree into an NFA optimized for conversion to a 00353 * superstate-NFA. 00354 * 00355 * This page is about syntax trees. 00356 */ 00357 00358 enum rexp_node_type 00359 { 00360 r_cset, /* Match from a character set. `a' or `[a-z]'*/ 00361 r_concat, /* Concat two subexpressions. `ab' */ 00362 r_alternate, /* Choose one of two subexpressions. `a\|b' */ 00363 r_opt, /* Optional subexpression. `a?' */ 00364 r_star, /* Repeated subexpression. `a*' */ 00365 00366 00367 /* A 2phase-star is a variation on a repeated subexpression. 00368 * In this case, there are two subexpressions. The first, if matched, 00369 * begins a repitition (otherwise, the whole expression is matches the 00370 * empth string). 00371 * 00372 * After matching the first subexpression, a 2phase star either finishes, 00373 * or matches the second subexpression. If the second subexpression is 00374 * matched, then the whole construct repeats. 00375 * 00376 * 2phase stars are used in two circumstances. First, they 00377 * are used as part of the implementation of POSIX intervals (counted 00378 * repititions). Second, they are used to implement proper star 00379 * semantics when the repeated subexpression contains paths of 00380 * only side effects. See rx_compile for more information. 00381 */ 00382 r_2phase_star, 00383 00384 00385 /* c.f. "typedef void * rx_side_effect" */ 00386 r_side_effect, 00387 00388 /* This is an extension type: It is for transient use in source->source 00389 * transformations (implemented over syntax trees). 00390 */ 00391 r_data 00392 }; 00393 00394 /* A side effect is a matcher-specific action associated with 00395 * transitions in the NFA. The details of side effects are up 00396 * to the matcher. To the compiler and superstate constructors 00397 * side effects are opaque: 00398 */ 00399 00400 typedef void * rx_side_effect; 00401 00402 /* Nodes in a syntax tree are of this type: 00403 */ 00404 struct rexp_node 00405 { 00406 enum rexp_node_type type; 00407 union 00408 { 00409 rx_Bitset cset; 00410 rx_side_effect side_effect; 00411 struct 00412 { 00413 struct rexp_node *left; 00414 struct rexp_node *right; 00415 } pair; 00416 void * data; 00417 } params; 00418 }; 00419 00420 00421 00422 /* NFA 00423 * 00424 * A syntax tree is compiled into an NFA. This page defines the structure 00425 * of that NFA. 00426 */ 00427 00428 struct rx_nfa_state 00429 { 00430 /* These are kept in a list as the NFA is being built. */ 00431 struct rx_nfa_state *next; 00432 00433 /* After the NFA is built, states are given integer id's. 00434 * States whose outgoing transitions are all either epsilon or 00435 * side effect edges are given ids less than 0. Other states 00436 * are given successive non-negative ids starting from 0. 00437 */ 00438 int id; 00439 00440 /* The list of NFA edges that go from this state to some other. */ 00441 struct rx_nfa_edge *edges; 00442 00443 /* If you land in this state, then you implicitly land 00444 * in all other states reachable by only epsilon translations. 00445 * Call the set of maximal paths to such states the epsilon closure 00446 * of this state. 00447 * 00448 * There may be other states that are reachable by a mixture of 00449 * epsilon and side effect edges. Consider the set of maximal paths 00450 * of that sort from this state. Call it the epsilon-side-effect 00451 * closure of the state. 00452 * 00453 * The epsilon closure of the state is a subset of the epsilon-side- 00454 * effect closure. It consists of all the paths that contain 00455 * no side effects -- only epsilon edges. 00456 * 00457 * The paths in the epsilon-side-effect closure can be partitioned 00458 * into equivalance sets. Two paths are equivalant if they have the 00459 * same set of side effects, in the same order. The epsilon-closure 00460 * is one of these equivalance sets. Let's call these equivalance 00461 * sets: observably equivalant path sets. That name is chosen 00462 * because equivalance of two paths means they cause the same side 00463 * effects -- so they lead to the same subsequent observations other 00464 * than that they may wind up in different target states. 00465 * 00466 * The superstate nfa, which is derived from this nfa, is based on 00467 * the observation that all of the paths in an observably equivalant 00468 * path set can be explored at the same time, provided that the 00469 * matcher keeps track not of a single nfa state, but of a set of 00470 * states. In particular, after following all the paths in an 00471 * observably equivalant set, you wind up at a set of target states. 00472 * That set of target states corresponds to one state in the 00473 * superstate NFA. 00474 * 00475 * Staticly, before matching begins, it is convenient to analyze the 00476 * nfa. Each state is labeled with a list of the observably 00477 * equivalant path sets who's union covers all the 00478 * epsilon-side-effect paths beginning in this state. This list is 00479 * called the possible futures of the state. 00480 * 00481 * A trivial example is this NFA: 00482 * s1 00483 * A ---> B 00484 * 00485 * s2 00486 * ---> C 00487 * 00488 * epsilon s1 00489 * ---------> D ------> E 00490 * 00491 * 00492 * In this example, A has two possible futures. 00493 * One invokes the side effect `s1' and contains two paths, 00494 * one ending in state B, the other in state E. 00495 * The other invokes the side effect `s2' and contains only 00496 * one path, landing in state C. 00497 */ 00498 struct rx_possible_future *futures; 00499 00500 00501 /* There are exactly two distinguished states in every NFA: */ 00502 unsigned int is_final:1; 00503 unsigned int is_start:1; 00504 00505 /* These are used during NFA construction... */ 00506 unsigned int eclosure_needed:1; 00507 unsigned int mark:1; 00508 }; 00509 00510 00511 /* An edge in an NFA is typed: */ 00512 enum rx_nfa_etype 00513 { 00514 /* A cset edge is labled with a set of characters one of which 00515 * must be matched for the edge to be taken. 00516 */ 00517 ne_cset, 00518 00519 /* An epsilon edge is taken whenever its starting state is 00520 * reached. 00521 */ 00522 ne_epsilon, 00523 00524 /* A side effect edge is taken whenever its starting state is 00525 * reached. Side effects may cause the match to fail or the 00526 * position of the matcher to advance. 00527 */ 00528 ne_side_effect /* A special kind of epsilon. */ 00529 }; 00530 00531 struct rx_nfa_edge 00532 { 00533 struct rx_nfa_edge *next; 00534 enum rx_nfa_etype type; 00535 struct rx_nfa_state *dest; 00536 union 00537 { 00538 rx_Bitset cset; 00539 rx_side_effect side_effect; 00540 } params; 00541 }; 00542 00543 00544 00545 /* A possible future consists of a list of side effects 00546 * and a set of destination states. Below are their 00547 * representations. These structures are hash-consed which 00548 * means that lists with the same elements share a representation 00549 * (their addresses are ==). 00550 */ 00551 00552 struct rx_nfa_state_set 00553 { 00554 struct rx_nfa_state * car; 00555 struct rx_nfa_state_set * cdr; 00556 }; 00557 00558 struct rx_se_list 00559 { 00560 rx_side_effect car; 00561 struct rx_se_list * cdr; 00562 }; 00563 00564 struct rx_possible_future 00565 { 00566 struct rx_possible_future *next; 00567 struct rx_se_list * effects; 00568 struct rx_nfa_state_set * destset; 00569 }; 00570 00571 00572 00573 /* This begins the description of the superstate NFA. 00574 * 00575 * The superstate NFA corresponds to the NFA in these ways: 00576 * 00577 * Every superstate NFA states SUPER correspond to sets of NFA states, 00578 * nfa_states(SUPER). 00579 * 00580 * Superstate edges correspond to NFA paths. 00581 * 00582 * The superstate has no epsilon transitions; 00583 * every edge has a character label, and a (possibly empty) side 00584 * effect label. The side effect label corresponds to a list of 00585 * side effects that occur in the NFA. These parts are referred 00586 * to as: superedge_character(EDGE) and superedge_sides(EDGE). 00587 * 00588 * For a superstate edge EDGE starting in some superstate SUPER, 00589 * the following is true (in pseudo-notation :-): 00590 * 00591 * exists DEST in nfa_states s.t. 00592 * exists nfaEDGE in nfa_edges s.t. 00593 * origin (nfaEDGE) == DEST 00594 * && origin (nfaEDGE) is a member of nfa_states(SUPER) 00595 * && exists PF in possible_futures(dest(nfaEDGE)) s.t. 00596 * sides_of_possible_future (PF) == superedge_sides (EDGE) 00597 * 00598 * also: 00599 * 00600 * let SUPER2 := superedge_destination(EDGE) 00601 * nfa_states(SUPER2) 00602 * == union of all nfa state sets S s.t. 00603 * exists PF in possible_futures(dest(nfaEDGE)) s.t. 00604 * sides_of_possible_future (PF) == superedge_sides (EDGE) 00605 * && S == dests_of_possible_future (PF) } 00606 * 00607 * Or in english, every superstate is a set of nfa states. A given 00608 * character and a superstate implies many transitions in the NFA -- 00609 * those that begin with an edge labeled with that character from a 00610 * state in the set corresponding to the superstate. 00611 * 00612 * The destinations of those transitions each have a set of possible 00613 * futures. A possible future is a list of side effects and a set of 00614 * destination NFA states. Two sets of possible futures can be 00615 * `merged' by combining all pairs of possible futures that have the 00616 * same side effects. A pair is combined by creating a new future 00617 * with the same side effect but the union of the two destination sets. 00618 * In this way, all the possible futures suggested by a superstate 00619 * and a character can be merged into a set of possible futures where 00620 * no two elements of the set have the same set of side effects. 00621 * 00622 * The destination of a possible future, being a set of NFA states, 00623 * corresponds to a supernfa state. So, the merged set of possible 00624 * futures we just created can serve as a set of edges in the 00625 * supernfa. 00626 * 00627 * The representation of the superstate nfa and the nfa is critical. 00628 * The nfa has to be compact, but has to facilitate the rapid 00629 * computation of missing superstates. The superstate nfa has to 00630 * be fast to interpret, lazilly constructed, and bounded in space. 00631 * 00632 * To facilitate interpretation, the superstate data structures are 00633 * peppered with `instruction frames'. There is an instruction set 00634 * defined below which matchers using the supernfa must be able to 00635 * interpret. 00636 * 00637 * We'd like to make it possible but not mandatory to use code 00638 * addresses to represent instructions (c.f. gcc's computed goto). 00639 * Therefore, we define an enumerated type of opcodes, and when 00640 * writing one of these instructions into a data structure, use 00641 * the opcode as an index into a table of instruction values. 00642 * 00643 * Here are the opcodes that occur in the superstate nfa: 00644 */ 00645 00646 00647 /* Every superstate contains a table of instruction frames indexed 00648 * by characters. A normal `move' in a matcher is to fetch the next 00649 * character and use it as an index into a superstates transition 00650 * table. 00651 * 00652 * In the fasted case, only one edge follows from that character. 00653 * In other cases there is more work to do. 00654 * 00655 * The descriptions of the opcodes refer to data structures that are 00656 * described further below. 00657 */ 00658 00659 enum rx_opcode 00660 { 00661 /* 00662 * BACKTRACK_POINT is invoked when a character transition in 00663 * a superstate leads to more than one edge. In that case, 00664 * the edges have to be explored independently using a backtracking 00665 * strategy. 00666 * 00667 * A BACKTRACK_POINT instruction is stored in a superstate's 00668 * transition table for some character when it is known that that 00669 * character crosses more than one edge. On encountering this 00670 * instruction, the matcher saves enough state to backtrack to this 00671 * point in the match later. 00672 */ 00673 rx_backtrack_point = 0, /* data is (struct transition_class *) */ 00674 00675 /* 00676 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. 00677 * There is one occurence of this instruction per rx_distinct_future. 00678 * This instruction is skipped if a rx_distinct_future has no side effects. 00679 */ 00680 rx_do_side_effects = rx_backtrack_point + 1, 00681 00682 /* data is (struct rx_distinct_future *) */ 00683 00684 /* 00685 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose 00686 * destination superstate has been reclaimed (or was never built). 00687 * It recomputes the destination superstate. 00688 * RX_CACHE_MISS is also stored in a superstate transition table before 00689 * any of its edges have been built. 00690 */ 00691 rx_cache_miss = rx_do_side_effects + 1, 00692 /* data is (struct rx_distinct_future *) */ 00693 00694 /* 00695 * RX_NEXT_CHAR is called to consume the next character and take the 00696 * corresponding transition. This is the only instruction that uses 00697 * the DATA field of the instruction frame instead of DATA_2. 00698 * (see EXPLORE_FUTURE in regex.c). 00699 */ 00700 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ 00701 00702 /* RX_BACKTRACK indicates that a transition fails. 00703 */ 00704 rx_backtrack = rx_next_char + 1, /* no data */ 00705 00706 /* 00707 * RX_ERROR_INX is stored only in places that should never be executed. 00708 */ 00709 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ 00710 00711 rx_num_instructions = rx_error_inx + 1 00712 }; 00713 00714 /* An id_instruction_table holds the values stored in instruction 00715 * frames. The table is indexed by the enums declared above. 00716 */ 00717 extern void * rx_id_instruction_table[rx_num_instructions]; 00718 00719 /* The heart of the matcher is a `word-code-interpreter' 00720 * (like a byte-code interpreter, except that instructions 00721 * are a full word wide). 00722 * 00723 * Instructions are not stored in a vector of code, instead, 00724 * they are scattered throughout the data structures built 00725 * by the regexp compiler and the matcher. One word-code instruction, 00726 * together with the arguments to that instruction, constitute 00727 * an instruction frame (struct rx_inx). 00728 * 00729 * This structure type is padded by hand to a power of 2 because 00730 * in one of the dominant cases, we dispatch by indexing a table 00731 * of instruction frames. If that indexing can be accomplished 00732 * by just a shift of the index, we're happy. 00733 * 00734 * Instructions take at most one argument, but there are two 00735 * slots in an instruction frame that might hold that argument. 00736 * These are called data and data_2. The data slot is only 00737 * used for one instruction (RX_NEXT_CHAR). For all other 00738 * instructions, data should be set to 0. 00739 * 00740 * RX_NEXT_CHAR is the most important instruction by far. 00741 * By reserving the data field for its exclusive use, 00742 * instruction dispatch is sped up in that case. There is 00743 * no need to fetch both the instruction and the data, 00744 * only the data is needed. In other words, a `cycle' begins 00745 * by fetching the field data. If that is non-0, then it must 00746 * be the destination state of a next_char transition, so 00747 * make that value the current state, advance the match position 00748 * by one character, and start a new cycle. On the other hand, 00749 * if data is 0, fetch the instruction and do a more complicated 00750 * dispatch on that. 00751 */ 00752 00753 struct rx_inx 00754 { 00755 void * data; 00756 void * data_2; 00757 void * inx; 00758 void * fnord; 00759 }; 00760 00761 #ifndef RX_TAIL_ARRAY 00762 #define RX_TAIL_ARRAY 1 00763 #endif 00764 00765 /* A superstate corresponds to a set of nfa states. Those sets are 00766 * represented by STRUCT RX_SUPERSET. The constructors 00767 * guarantee that only one (shared) structure is created for a given set. 00768 */ 00769 struct rx_superset 00770 { 00771 int refs; /* This is a reference counted structure. */ 00772 00773 /* We keep these sets in a cache because (in an unpredictable way), 00774 * the same set is often created again and again. But that is also 00775 * problematic -- compatibility with POSIX and GNU regex requires 00776 * that we not be able to tell when a program discards a particular 00777 * NFA (thus invalidating the supersets created from it). 00778 * 00779 * But when a cache hit appears to occur, we will have in hand the 00780 * nfa for which it may have happened. That is why every nfa is given 00781 * its own sequence number. On a cache hit, the cache is validated 00782 * by comparing the nfa sequence number to this field: 00783 */ 00784 int id; 00785 00786 struct rx_nfa_state * car; /* May or may not be a valid addr. */ 00787 struct rx_superset * cdr; 00788 00789 /* If the corresponding superstate exists: */ 00790 struct rx_superstate * superstate; 00791 00792 00793 /* There is another bookkeeping problem. It is expensive to 00794 * compute the starting nfa state set for an nfa. So, once computed, 00795 * it is cached in the `struct rx'. 00796 * 00797 * But, the state set can be flushed from the superstate cache. 00798 * When that happens, we can't know if the corresponding `struct rx' 00799 * is still alive or if it has been freed or re-used by the program. 00800 * So, the cached pointer to this set in a struct rx might be invalid 00801 * and we need a way to validate it. 00802 * 00803 * Fortunately, even if this set is flushed from the cache, it is 00804 * not freed. It just goes on the free-list of supersets. 00805 * So we can still examine it. 00806 * 00807 * So to validate a starting set memo, check to see if the 00808 * starts_for field still points back to the struct rx in question, 00809 * and if the ID matches the rx sequence number. 00810 */ 00811 struct rx * starts_for; 00812 00813 /* This is used to link into a hash bucket so these objects can 00814 * be `hash-consed'. 00815 */ 00816 struct rx_hash_item hash_item; 00817 }; 00818 00819 #define rx_protect_superset(RX,CON) (++(CON)->refs) 00820 00821 /* The terminology may be confusing (rename this structure?). 00822 * Every character occurs in at most one rx_super_edge per super-state. 00823 * But, that structure might have more than one option, indicating a point 00824 * of non-determinism. 00825 * 00826 * In other words, this structure holds a list of superstate edges 00827 * sharing a common starting state and character label. The edges 00828 * are in the field OPTIONS. All superstate edges sharing the same 00829 * starting state and character are in this list. 00830 */ 00831 struct rx_super_edge 00832 { 00833 struct rx_super_edge *next; 00834 struct rx_inx rx_backtrack_frame; 00835 int cset_size; 00836 rx_Bitset cset; 00837 struct rx_distinct_future *options; 00838 }; 00839 00840 /* A superstate is a set of nfa states (RX_SUPERSET) along 00841 * with a transition table. Superstates are built on demand and reclaimed 00842 * without warning. To protect a superstate from this ghastly fate, 00843 * use LOCK_SUPERSTATE. 00844 */ 00845 struct rx_superstate 00846 { 00847 int rx_id; /* c.f. the id field of rx_superset */ 00848 int locks; /* protection from reclamation */ 00849 00850 /* Within a superstate cache, all the superstates are kept in a big 00851 * queue. The tail of the queue is the state most likely to be 00852 * reclaimed. The *recyclable fields hold the queue position of 00853 * this state. 00854 */ 00855 struct rx_superstate * next_recyclable; 00856 struct rx_superstate * prev_recyclable; 00857 00858 /* The supernfa edges that exist in the cache and that have 00859 * this state as their destination are kept in this list: 00860 */ 00861 struct rx_distinct_future * transition_refs; 00862 00863 /* The list of nfa states corresponding to this superstate: */ 00864 struct rx_superset * contents; 00865 00866 /* The list of edges in the cache beginning from this state. */ 00867 struct rx_super_edge * edges; 00868 00869 /* A tail of the recyclable queue is marked as semifree. A semifree 00870 * state has no incoming next_char transitions -- any transition 00871 * into a semifree state causes a complex dispatch with the side 00872 * effect of rescuing the state from its semifree state. 00873 * 00874 * An alternative to this might be to make next_char more expensive, 00875 * and to move a state to the head of the recyclable queue whenever 00876 * it is entered. That way, popular states would never be recycled. 00877 * 00878 * But unilaterally making next_char more expensive actually loses. 00879 * So, incoming transitions are only made expensive for states near 00880 * the tail of the recyclable queue. The more cache contention 00881 * there is, the more frequently a state will have to prove itself 00882 * and be moved back to the front of the queue. If there is less 00883 * contention, then popular states just aggregate in the front of 00884 * the queue and stay there. 00885 */ 00886 int is_semifree; 00887 00888 00889 /* This keeps track of the size of the transition table for this 00890 * state. There is a half-hearted attempt to support variable sized 00891 * superstates. 00892 */ 00893 int trans_size; 00894 00895 /* Indexed by characters... */ 00896 struct rx_inx transitions[RX_TAIL_ARRAY]; 00897 }; 00898 00899 00900 /* A list of distinct futures define the edges that leave from a 00901 * given superstate on a given character. c.f. rx_super_edge. 00902 */ 00903 00904 struct rx_distinct_future 00905 { 00906 struct rx_distinct_future * next_same_super_edge[2]; 00907 struct rx_distinct_future * next_same_dest; 00908 struct rx_distinct_future * prev_same_dest; 00909 struct rx_superstate * present; /* source state */ 00910 struct rx_superstate * future; /* destination state */ 00911 struct rx_super_edge * edge; 00912 00913 00914 /* The future_frame holds the instruction that should be executed 00915 * after all the side effects are done, when it is time to complete 00916 * the transition to the next state. 00917 * 00918 * Normally this is a next_char instruction, but it may be a 00919 * cache_miss instruction as well, depending on whether or not 00920 * the superstate is in the cache and semifree. 00921 * 00922 * If this is the only future for a given superstate/char, and 00923 * if there are no side effects to be performed, this frame is 00924 * not used (directly) at all. Instead, its contents are copied 00925 * into the transition table of the starting state of this dist. future. 00926 */ 00927 struct rx_inx future_frame; 00928 00929 struct rx_inx side_effects_frame; 00930 struct rx_se_list * effects; 00931 }; 00932 00933 #define rx_lock_superstate(R,S) ((S)->locks++) 00934 #define rx_unlock_superstate(R,S) (--(S)->locks) 00935 00936 00937 /* This page destined for rx.h */ 00938 00939 struct rx_blocklist 00940 { 00941 struct rx_blocklist * next; 00942 int bytes; 00943 }; 00944 00945 struct rx_freelist 00946 { 00947 struct rx_freelist * next; 00948 }; 00949 00950 struct rx_cache; 00951 00952 #ifdef __STDC__ 00953 typedef void (*rx_morecore_fn)(struct rx_cache *); 00954 #else 00955 typedef void (*rx_morecore_fn)(); 00956 #endif 00957 00958 /* You use this to control the allocation of superstate data 00959 * during matching. Most of it should be initialized to 0. 00960 * 00961 * A MORECORE function is necessary. It should allocate 00962 * a new block of memory or return 0. 00963 * A default that uses malloc is called `rx_morecore'. 00964 * 00965 * The number of SUPERSTATES_ALLOWED indirectly limits how much memory 00966 * the system will try to allocate. The default is 128. Batch style 00967 * applications that are very regexp intensive should use as high a number 00968 * as possible without thrashing. 00969 * 00970 * The LOCAL_CSET_SIZE is the number of characters in a character set. 00971 * It is therefore the number of entries in a superstate transition table. 00972 * Generally, it should be 256. If your character set has 16 bits, 00973 * it is better to translate your regexps into equivalent 8 bit patterns. 00974 */ 00975 00976 struct rx_cache 00977 { 00978 struct rx_hash_rules superset_hash_rules; 00979 00980 /* Objects are allocated by incrementing a pointer that 00981 * scans across rx_blocklists. 00982 */ 00983 struct rx_blocklist * memory; 00984 struct rx_blocklist * memory_pos; 00985 int bytes_left; 00986 char * memory_addr; 00987 rx_morecore_fn morecore; 00988 00989 /* Freelists. */ 00990 struct rx_freelist * free_superstates; 00991 struct rx_freelist * free_transition_classes; 00992 struct rx_freelist * free_discernable_futures; 00993 struct rx_freelist * free_supersets; 00994 struct rx_freelist * free_hash; 00995 00996 /* Two sets of superstates -- those that are semifreed, and those 00997 * that are being used. 00998 */ 00999 struct rx_superstate * lru_superstate; 01000 struct rx_superstate * semifree_superstate; 01001 01002 struct rx_superset * empty_superset; 01003 01004 int superstates; 01005 int semifree_superstates; 01006 int hits; 01007 int misses; 01008 int superstates_allowed; 01009 01010 int local_cset_size; 01011 void ** instruction_table; 01012 01013 struct rx_hash superset_table; 01014 }; 01015 01016 01017 01018 /* The lowest-level search function supports arbitrarily fragmented 01019 * strings and (optionally) suspendable/resumable searches. 01020 * 01021 * Callers have to provide a few hooks. 01022 */ 01023 01024 #ifndef __GNUC__ 01025 #ifdef __STDC__ 01026 #define __const__ const 01027 #else 01028 #define __const__ 01029 #endif 01030 #endif 01031 01032 /* This holds a matcher position */ 01033 struct rx_string_position 01034 { 01035 __const__ unsigned char * pos; /* The current pos. */ 01036 __const__ unsigned char * string; /* The current string burst. */ 01037 __const__ unsigned char * end; /* First invalid position >= POS. */ 01038 int offset; /* Integer address of the current burst. */ 01039 int size; /* Current string's size. */ 01040 int search_direction; /* 1 or -1 */ 01041 int search_end; /* First position to not try. */ 01042 }; 01043 01044 01045 enum rx_get_burst_return 01046 { 01047 rx_get_burst_continuation, 01048 rx_get_burst_error, 01049 rx_get_burst_ok, 01050 rx_get_burst_no_more 01051 }; 01052 01053 01054 /* A call to get burst should make POS valid. It might be invalid 01055 * if the STRING field doesn't point to a burst that actually 01056 * contains POS. 01057 * 01058 * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to 01059 * whether or not to pad to the left. Padding to the right is always 01060 * appropriate, but need not go past the point indicated by STOP. 01061 * 01062 * If a continuation is returned, then the reentering call to 01063 * a search function will retry the get_burst. 01064 */ 01065 01066 #ifdef __STDC__ 01067 typedef enum rx_get_burst_return 01068 (*rx_get_burst_fn) (struct rx_string_position * pos, 01069 void * app_closure, 01070 int stop); 01071 01072 #else 01073 typedef enum rx_get_burst_return (*rx_get_burst_fn) (); 01074 #endif 01075 01076 01077 enum rx_back_check_return 01078 { 01079 rx_back_check_continuation, 01080 rx_back_check_error, 01081 rx_back_check_pass, 01082 rx_back_check_fail 01083 }; 01084 01085 /* Back_check should advance the position it is passed 01086 * over rparen - lparen characters and return pass iff 01087 * the characters starting at POS match those indexed 01088 * by [LPAREN..RPAREN]. 01089 * 01090 * If a continuation is returned, then the reentering call to 01091 * a search function will retry the back_check. 01092 */ 01093 01094 #ifdef __STDC__ 01095 typedef enum rx_back_check_return 01096 (*rx_back_check_fn) (struct rx_string_position * pos, 01097 int lparen, 01098 int rparen, 01099 unsigned char * translate, 01100 void * app_closure, 01101 int stop); 01102 01103 #else 01104 typedef enum rx_back_check_return (*rx_back_check_fn) (); 01105 #endif 01106 01107 01108 01109 01110 /* A call to fetch_char should return the character at POS or POS + 1. 01111 * Returning continuations here isn't supported. OFFSET is either 0 or 1 01112 * and indicates which characters is desired. 01113 */ 01114 01115 #ifdef __STDC__ 01116 typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos, 01117 int offset, 01118 void * app_closure, 01119 int stop); 01120 #else 01121 typedef int (*rx_fetch_char_fn) (); 01122 #endif 01123 01124 01125 enum rx_search_return 01126 { 01127 rx_search_continuation = -4, 01128 rx_search_error = -3, 01129 rx_search_soft_fail = -2, /* failed by running out of string */ 01130 rx_search_fail = -1 /* failed only by reaching failure states */ 01131 /* return values >= 0 indicate the position of a successful match */ 01132 }; 01133 01134 01135 01136 01137 01138 01139 /* regex.h 01140 * 01141 * The remaining declarations replace regex.h. 01142 */ 01143 01144 /* This is an array of error messages corresponding to the error codes. 01145 */ 01146 extern __const__ char *re_error_msg[]; 01147 01148 /* If any error codes are removed, changed, or added, update the 01149 `re_error_msg' table in regex.c. */ 01150 typedef enum 01151 { 01152 REG_NOERROR = 0, /* Success. */ 01153 REG_NOMATCH, /* Didn't find a match (for regexec). */ 01154 01155 /* POSIX regcomp return error codes. (In the order listed in the 01156 standard.) */ 01157 REG_BADPAT, /* Invalid pattern. */ 01158 REG_ECOLLATE, /* Not implemented. */ 01159 REG_ECTYPE, /* Invalid character class name. */ 01160 REG_EESCAPE, /* Trailing backslash. */ 01161 REG_ESUBREG, /* Invalid back reference. */ 01162 REG_EBRACK, /* Unmatched left bracket. */ 01163 REG_EPAREN, /* Parenthesis imbalance. */ 01164 REG_EBRACE, /* Unmatched \{. */ 01165 REG_BADBR, /* Invalid contents of \{\}. */ 01166 REG_ERANGE, /* Invalid range end. */ 01167 REG_ESPACE, /* Ran out of memory. */ 01168 REG_BADRPT, /* No preceding re for repetition op. */ 01169 01170 /* Error codes we've added. */ 01171 REG_EEND, /* Premature end. */ 01172 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ 01173 REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ 01174 } reg_errcode_t; 01175 01176 /* The regex.c support, as a client of rx, defines a set of possible 01177 * side effects that can be added to the edge lables of nfa edges. 01178 * Here is the list of sidef effects in use. 01179 */ 01180 01181 enum re_side_effects 01182 { 01183 #define RX_WANT_SE_DEFS 1 01184 #undef RX_DEF_SE 01185 #undef RX_DEF_CPLX_SE 01186 #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE, 01187 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE, 01188 #include "MNrx.h" 01189 #undef RX_DEF_SE 01190 #undef RX_DEF_CPLX_SE 01191 #undef RX_WANT_SE_DEFS 01192 re_floogle_flap = 65533 01193 }; 01194 01195 /* These hold paramaters for the kinds of side effects that are possible 01196 * in the supported pattern languages. These include things like the 01197 * numeric bounds of {} operators and the index of paren registers for 01198 * subexpression measurement or backreferencing. 01199 */ 01200 struct re_se_params 01201 { 01202 enum re_side_effects se; 01203 int op1; 01204 int op2; 01205 }; 01206 01207 typedef unsigned reg_syntax_t; 01208 01209 struct re_pattern_buffer 01210 { 01211 struct rx rx; 01212 reg_syntax_t syntax; /* See below for syntax bit definitions. */ 01213 01214 unsigned int no_sub:1; /* If set, don't return register offsets. */ 01215 unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */ 01216 unsigned int not_eol:1; /* match at the ends of the string. */ 01217 unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/ 01218 unsigned int least_subs:1; /* If set, and returning registers, return 01219 * as few values as possible. Only 01220 * backreferenced groups and group 0 (the whole 01221 * match) will be returned. 01222 */ 01223 01224 /* If true, this says that the matcher should keep registers on its 01225 * backtracking stack. For many patterns, we can easily determine that 01226 * this isn't necessary. 01227 */ 01228 unsigned int match_regs_on_stack:1; 01229 unsigned int search_regs_on_stack:1; 01230 01231 /* is_anchored and begbuf_only are filled in by rx_compile. */ 01232 unsigned int is_anchored:1; /* Anchorded by ^? */ 01233 unsigned int begbuf_only:1; /* Anchored to char position 0? */ 01234 01235 01236 /* If REGS_UNALLOCATED, allocate space in the `regs' structure 01237 * for `max (RE_NREGS, re_nsub + 1)' groups. 01238 * If REGS_REALLOCATE, reallocate space if necessary. 01239 * If REGS_FIXED, use what's there. 01240 */ 01241 #define REGS_UNALLOCATED 0 01242 #define REGS_REALLOCATE 1 01243 #define REGS_FIXED 2 01244 unsigned int regs_allocated:2; 01245 01246 01247 /* Either a translate table to apply to all characters before 01248 * comparing them, or zero for no translation. The translation 01249 * is applied to a pattern when it is compiled and to a string 01250 * when it is matched. 01251 */ 01252 unsigned char * translate; 01253 01254 /* If this is a valid pointer, it tells rx not to store the extents of 01255 * certain subexpressions (those corresponding to non-zero entries). 01256 * Passing 0x1 is the same as passing an array of all ones. Passing 0x0 01257 * is the same as passing an array of all zeros. 01258 * The array should contain as many entries as their are subexps in the 01259 * regexp. 01260 * 01261 * For POSIX compatability, when using regcomp and regexec this field 01262 * is zeroed and ignored. 01263 */ 01264 char * syntax_parens; 01265 01266 /* Number of subexpressions found by the compiler. */ 01267 size_t re_nsub; 01268 01269 void * buffer; /* Malloced memory for the nfa. */ 01270 unsigned long allocated; /* Size of that memory. */ 01271 01272 /* Pointer to a fastmap, if any, otherwise zero. re_search uses 01273 * the fastmap, if there is one, to skip over impossible 01274 * starting points for matches. */ 01275 char *fastmap; 01276 01277 unsigned int fastmap_accurate:1; /* These three are internal. */ 01278 unsigned int can_match_empty:1; 01279 struct rx_nfa_state * start; /* The nfa starting state. */ 01280 01281 /* This is the list of iterator bounds for {lo,hi} constructs. 01282 * The memory pointed to is part of the rx->buffer. 01283 */ 01284 struct re_se_params *se_params; 01285 01286 /* This is a bitset representation of the fastmap. 01287 * This is a true fastmap that already takes the translate 01288 * table into account. 01289 */ 01290 rx_Bitset fastset; 01291 }; 01292 01293 /* Type for byte offsets within the string. POSIX mandates this. */ 01294 typedef int regoff_t; 01295 01296 /* This is the structure we store register match data in. See 01297 regex.texinfo for a full description of what registers match. */ 01298 struct re_registers 01299 { 01300 unsigned num_regs; 01301 regoff_t *start; 01302 regoff_t *end; 01303 }; 01304 01305 typedef struct re_pattern_buffer regex_t; 01306 01307 /* POSIX specification for registers. Aside from the different names than 01308 `re_registers', POSIX uses an array of structures, instead of a 01309 structure of arrays. */ 01310 typedef struct 01311 { 01312 regoff_t rm_so; /* Byte offset from string's start to substring's start. */ 01313 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ 01314 } regmatch_t; 01315 01316 01317 /* The following bits are used to determine the regexp syntax we 01318 recognize. The set/not-set meanings are chosen so that Emacs syntax 01319 remains the value 0. The bits are given in alphabetical order, and 01320 the definitions shifted by one from the previous bit; thus, when we 01321 add or remove a bit, only one other definition need change. */ 01322 01323 /* If this bit is not set, then \ inside a bracket expression is literal. 01324 If set, then such a \ quotes the following character. */ 01325 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) 01326 01327 /* If this bit is not set, then + and ? are operators, and \+ and \? are 01328 literals. 01329 If set, then \+ and \? are operators and + and ? are literals. */ 01330 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) 01331 01332 /* If this bit is set, then character classes are supported. They are: 01333 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 01334 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. 01335 If not set, then character classes are not supported. */ 01336 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) 01337 01338 /* If this bit is set, then ^ and $ are always anchors (outside bracket 01339 expressions, of course). 01340 If this bit is not set, then it depends: 01341 ^ is an anchor if it is at the beginning of a regular 01342 expression or after an open-group or an alternation operator; 01343 $ is an anchor if it is at the end of a regular expression, or 01344 before a close-group or an alternation operator. 01345 01346 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because 01347 POSIX draft 11.2 says that * etc. in leading positions is undefined. 01348 We already implemented a previous draft which made those constructs 01349 invalid, though, so we haven't changed the code back. */ 01350 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) 01351 01352 /* If this bit is set, then special characters are always special 01353 regardless of where they are in the pattern. 01354 If this bit is not set, then special characters are special only in 01355 some contexts; otherwise they are ordinary. Specifically, 01356 * + ? and intervals are only special when not after the beginning, 01357 open-group, or alternation operator. */ 01358 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) 01359 01360 /* If this bit is set, then *, +, ?, and { cannot be first in an re or 01361 immediately after an alternation or begin-group operator. */ 01362 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) 01363 01364 /* If this bit is set, then . matches newline. 01365 If not set, then it doesn't. */ 01366 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) 01367 01368 /* If this bit is set, then . doesn't match NUL. 01369 If not set, then it does. */ 01370 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) 01371 01372 /* If this bit is set, nonmatching lists [^...] do not match newline. 01373 If not set, they do. */ 01374 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) 01375 01376 /* If this bit is set, either \{...\} or {...} defines an 01377 interval, depending on RE_NO_BK_BRACES. 01378 If not set, \{, \}, {, and } are literals. */ 01379 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 01380 01381 /* If this bit is set, +, ? and | aren't recognized as operators. 01382 If not set, they are. */ 01383 #define RE_LIMITED_OPS (RE_INTERVALS << 1) 01384 01385 /* If this bit is set, newline is an alternation operator. 01386 If not set, newline is literal. */ 01387 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) 01388 01389 /* If this bit is set, then `{...}' defines an interval, and \{ and \} 01390 are literals. 01391 If not set, then `\{...\}' defines an interval. */ 01392 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) 01393 01394 /* If this bit is set, (...) defines a group, and \( and \) are literals. 01395 If not set, \(...\) defines a group, and ( and ) are literals. */ 01396 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) 01397 01398 /* If this bit is set, then <digit> matches <digit>. 01399 If not set, then <digit> is a back-reference. */ 01400 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) 01401 01402 /* If this bit is set, then | is an alternation operator, and \| is literal. 01403 If not set, then \| is an alternation operator, and | is literal. */ 01404 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) 01405 01406 /* If this bit is set, then an ending range point collating higher 01407 than the starting range point, as in [z-a], is invalid. 01408 If not set, then when ending range point collates higher than the 01409 starting range point, the range is ignored. */ 01410 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) 01411 01412 /* If this bit is set, then an unmatched ) is ordinary. 01413 If not set, then an unmatched ) is invalid. */ 01414 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) 01415 01416 /* This global variable defines the particular regexp syntax to use (for 01417 some interfaces). When a regexp is compiled, the syntax used is 01418 stored in the pattern buffer, so changing this does not affect 01419 already-compiled regexps. */ 01420 extern reg_syntax_t re_syntax_options; 01421 01422 /* Define combinations of the above bits for the standard possibilities. 01423 (The [[[ comments delimit what gets put into the Texinfo file, so 01424 don't delete them!) */ 01425 /* [[[begin syntaxes]]] */ 01426 #define RE_SYNTAX_EMACS 0 01427 01428 #define RE_SYNTAX_AWK \ 01429 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ 01430 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 01431 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ 01432 | RE_UNMATCHED_RIGHT_PAREN_ORD) 01433 01434 #define RE_SYNTAX_POSIX_AWK \ 01435 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) 01436 01437 #define RE_SYNTAX_GREP \ 01438 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ 01439 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ 01440 | RE_NEWLINE_ALT) 01441 01442 #define RE_SYNTAX_EGREP \ 01443 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ 01444 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ 01445 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ 01446 | RE_NO_BK_VBAR) 01447 01448 #define RE_SYNTAX_POSIX_EGREP \ 01449 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) 01450 01451 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC 01452 01453 /* Syntax bits common to both basic and extended POSIX regex syntax. */ 01454 #define _RE_SYNTAX_POSIX_COMMON \ 01455 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ 01456 | RE_INTERVALS | RE_NO_EMPTY_RANGES) 01457 01458 #define RE_SYNTAX_POSIX_BASIC \ 01459 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) 01460 01461 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes 01462 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this 01463 isn't minimal, since other operators, such as \`, aren't disabled. */ 01464 #define RE_SYNTAX_POSIX_MINIMAL_BASIC \ 01465 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) 01466 01467 #define RE_SYNTAX_POSIX_EXTENDED \ 01468 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 01469 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ 01470 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ 01471 | RE_UNMATCHED_RIGHT_PAREN_ORD) 01472 01473 /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 01474 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 01475 #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ 01476 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 01477 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ 01478 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 01479 | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) 01480 /* [[[end syntaxes]]] */ 01481 01482 /* Maximum number of duplicates an interval can allow. Some systems 01483 (erroneously) define this in other header files, but we want our 01484 value, so remove any previous define. */ 01485 #ifdef RE_DUP_MAX 01486 #undef RE_DUP_MAX 01487 #endif 01488 #define RE_DUP_MAX ((1 << 15) - 1) 01489 01490 01491 01492 /* POSIX `cflags' bits (i.e., information for `regcomp'). */ 01493 01494 /* If this bit is set, then use extended regular expression syntax. 01495 If not set, then use basic regular expression syntax. */ 01496 #define REG_EXTENDED 1 01497 01498 /* If this bit is set, then ignore case when matching. 01499 If not set, then case is significant. */ 01500 #define REG_ICASE (REG_EXTENDED << 1) 01501 01502 /* If this bit is set, then anchors do not match at newline 01503 characters in the string. 01504 If not set, then anchors do match at newlines. */ 01505 #define REG_NEWLINE (REG_ICASE << 1) 01506 01507 /* If this bit is set, then report only success or fail in regexec. 01508 If not set, then returns differ between not matching and errors. */ 01509 #define REG_NOSUB (REG_NEWLINE << 1) 01510 01511 01512 /* POSIX `eflags' bits (i.e., information for regexec). */ 01513 01514 /* If this bit is set, then the beginning-of-line operator doesn't match 01515 the beginning of the string (presumably because it's not the 01516 beginning of a line). 01517 If not set, then the beginning-of-line operator does match the 01518 beginning of the string. */ 01519 #define REG_NOTBOL 1 01520 01521 /* Like REG_NOTBOL, except for the end-of-line. */ 01522 #define REG_NOTEOL (1 << 1) 01523 01524 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, 01525 * `re_match_2' returns information about at least this many registers 01526 * the first time a `regs' structure is passed. 01527 * 01528 * Also, this is the greatest number of backreferenced subexpressions 01529 * allowed in a pattern being matched without caller-supplied registers. 01530 */ 01531 #ifndef RE_NREGS 01532 #define RE_NREGS 30 01533 #endif 01534 01535 extern int rx_cache_bound; 01536 extern char rx_version_string[]; 01537 01538 01539 01540 #ifdef RX_WANT_RX_DEFS 01541 01542 /* This is decls to the interesting subsystems and lower layers 01543 * of rx. Everything which doesn't have a public counterpart in 01544 * regex.c is declared here. 01545 */ 01546 01547 01548 #ifdef __STDC__ 01549 typedef void (*rx_hash_freefn) (struct rx_hash_item * it); 01550 #else /* ndef __STDC__ */ 01551 typedef void (*rx_hash_freefn) (); 01552 #endif /* ndef __STDC__ */ 01553 01554 01555 01556 01557 #ifdef __STDC__ 01558 RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b); 01559 RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b); 01560 RX_DECL int rx_bitset_empty (int size, rx_Bitset set); 01561 RX_DECL void rx_bitset_null (int size, rx_Bitset b); 01562 RX_DECL void rx_bitset_universe (int size, rx_Bitset b); 01563 RX_DECL void rx_bitset_complement (int size, rx_Bitset b); 01564 RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b); 01565 RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b); 01566 RX_DECL void rx_bitset_intersection (int size, 01567 rx_Bitset a, rx_Bitset b); 01568 RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b); 01569 RX_DECL void rx_bitset_revdifference (int size, 01570 rx_Bitset a, rx_Bitset b); 01571 RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b); 01572 RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b); 01573 RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table, 01574 unsigned long hash, 01575 void * value, 01576 struct rx_hash_rules * rules); 01577 RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table, 01578 unsigned long hash, 01579 void * value, 01580 struct rx_hash_rules * rules); 01581 RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules); 01582 RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn, 01583 struct rx_hash_rules * rules); 01584 RX_DECL rx_Bitset rx_cset (struct rx *rx); 01585 RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a); 01586 RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c); 01587 RX_DECL struct rexp_node * rexp_node (struct rx *rx, 01588 enum rexp_node_type type); 01589 RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx, 01590 rx_Bitset b); 01591 RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx, 01592 struct rexp_node * a, 01593 struct rexp_node * b); 01594 RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx, 01595 struct rexp_node * a, 01596 struct rexp_node * b); 01597 RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx, 01598 struct rexp_node * a); 01599 RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx, 01600 struct rexp_node * a); 01601 RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx, 01602 struct rexp_node * a, 01603 struct rexp_node * b); 01604 RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx, 01605 rx_side_effect a); 01606 RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx, 01607 void * a); 01608 RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node); 01609 RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx, 01610 struct rexp_node *node); 01611 RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx); 01612 RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n); 01613 RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx, 01614 int id); 01615 RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx, 01616 enum rx_nfa_etype type, 01617 struct rx_nfa_state *start, 01618 struct rx_nfa_state *dest); 01619 RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e); 01620 RX_DECL void rx_free_nfa (struct rx *rx); 01621 RX_DECL int rx_build_nfa (struct rx *rx, 01622 struct rexp_node *rexp, 01623 struct rx_nfa_state **start, 01624 struct rx_nfa_state **end); 01625 RX_DECL void rx_name_nfa_states (struct rx *rx); 01626 RX_DECL int rx_eclose_nfa (struct rx *rx); 01627 RX_DECL void rx_delete_epsilon_transitions (struct rx *rx); 01628 RX_DECL int rx_compactify_nfa (struct rx *rx, 01629 void **mem, unsigned long *size); 01630 RX_DECL void rx_release_superset (struct rx *rx, 01631 struct rx_superset *set); 01632 RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx, 01633 struct rx_nfa_state *car, struct rx_superset *cdr); 01634 RX_DECL struct rx_superset * rx_superstate_eclosure_union 01635 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl); 01636 RX_DECL struct rx_superstate * rx_superstate (struct rx *rx, 01637 struct rx_superset *set); 01638 RX_DECL struct rx_inx * rx_handle_cache_miss 01639 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data); 01640 RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size, 01641 reg_syntax_t syntax, 01642 struct re_pattern_buffer * rxb); 01643 RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb); 01644 #else /* STDC */ 01645 RX_DECL int rx_bitset_is_equal (); 01646 RX_DECL int rx_bitset_is_subset (); 01647 RX_DECL int rx_bitset_empty (); 01648 RX_DECL void rx_bitset_null (); 01649 RX_DECL void rx_bitset_universe (); 01650 RX_DECL void rx_bitset_complement (); 01651 RX_DECL void rx_bitset_assign (); 01652 RX_DECL void rx_bitset_union (); 01653 RX_DECL void rx_bitset_intersection (); 01654 RX_DECL void rx_bitset_difference (); 01655 RX_DECL void rx_bitset_revdifference (); 01656 RX_DECL void rx_bitset_xor (); 01657 RX_DECL unsigned long rx_bitset_hash (); 01658 RX_DECL struct rx_hash_item * rx_hash_find (); 01659 RX_DECL struct rx_hash_item * rx_hash_store (); 01660 RX_DECL void rx_hash_free (); 01661 RX_DECL void rx_free_hash_table (); 01662 RX_DECL rx_Bitset rx_cset (); 01663 RX_DECL rx_Bitset rx_copy_cset (); 01664 RX_DECL void rx_free_cset (); 01665 RX_DECL struct rexp_node * rexp_node (); 01666 RX_DECL struct rexp_node * rx_mk_r_cset (); 01667 RX_DECL struct rexp_node * rx_mk_r_concat (); 01668 RX_DECL struct rexp_node * rx_mk_r_alternate (); 01669 RX_DECL struct rexp_node * rx_mk_r_opt (); 01670 RX_DECL struct rexp_node * rx_mk_r_star (); 01671 RX_DECL struct rexp_node * rx_mk_r_2phase_star (); 01672 RX_DECL struct rexp_node * rx_mk_r_side_effect (); 01673 RX_DECL struct rexp_node * rx_mk_r_data (); 01674 RX_DECL void rx_free_rexp (); 01675 RX_DECL struct rexp_node * rx_copy_rexp (); 01676 RX_DECL struct rx_nfa_state * rx_nfa_state (); 01677 RX_DECL void rx_free_nfa_state (); 01678 RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (); 01679 RX_DECL struct rx_nfa_edge * rx_nfa_edge (); 01680 RX_DECL void rx_free_nfa_edge (); 01681 RX_DECL void rx_free_nfa (); 01682 RX_DECL int rx_build_nfa (); 01683 RX_DECL void rx_name_nfa_states (); 01684 RX_DECL int rx_eclose_nfa (); 01685 RX_DECL void rx_delete_epsilon_transitions (); 01686 RX_DECL int rx_compactify_nfa (); 01687 RX_DECL void rx_release_superset (); 01688 RX_DECL struct rx_superset * rx_superset_cons (); 01689 RX_DECL struct rx_superset * rx_superstate_eclosure_union (); 01690 RX_DECL struct rx_superstate * rx_superstate (); 01691 RX_DECL struct rx_inx * rx_handle_cache_miss (); 01692 RX_DECL reg_errcode_t rx_compile (); 01693 RX_DECL void rx_blow_up_fastmap (); 01694 #endif /* STDC */ 01695 01696 01697 #endif /* RX_WANT_RX_DEFS */ 01698 01699 01700 01701 #ifdef __STDC__ 01702 extern int re_search_2 (struct re_pattern_buffer *rxb, 01703 __const__ char * string1, int size1, 01704 __const__ char * string2, int size2, 01705 int startpos, int range, 01706 struct re_registers *regs, 01707 int stop); 01708 extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string, 01709 int size, int startpos, int range, 01710 struct re_registers *regs); 01711 extern int re_match_2 (struct re_pattern_buffer * rxb, 01712 __const__ char * string1, int size1, 01713 __const__ char * string2, int size2, 01714 int pos, struct re_registers *regs, int stop); 01715 extern int re_match (struct re_pattern_buffer * rxb, 01716 __const__ char * string, 01717 int size, int pos, 01718 struct re_registers *regs); 01719 extern reg_syntax_t re_set_syntax (reg_syntax_t syntax); 01720 extern void re_set_registers (struct re_pattern_buffer *bufp, 01721 struct re_registers *regs, 01722 unsigned num_regs, 01723 regoff_t * starts, regoff_t * ends); 01724 extern __const__ char * re_compile_pattern (__const__ char *pattern, 01725 int length, 01726 struct re_pattern_buffer * rxb); 01727 extern int re_compile_fastmap (struct re_pattern_buffer * rxb); 01728 extern char * re_comp (__const__ char *s); 01729 extern int re_exec (__const__ char *s); 01730 extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags); 01731 extern int regexec (__const__ regex_t *preg, __const__ char *string, 01732 size_t nmatch, regmatch_t pmatch[], 01733 int eflags); 01734 extern size_t regerror (int errcode, __const__ regex_t *preg, 01735 char *errbuf, size_t errbuf_size); 01736 extern void regfree (regex_t *preg); 01737 01738 #else /* STDC */ 01739 extern int re_search_2 (); 01740 extern int re_search (); 01741 extern int re_match_2 (); 01742 extern int re_match (); 01743 extern reg_syntax_t re_set_syntax (); 01744 extern void re_set_registers (); 01745 extern __const__ char * re_compile_pattern (); 01746 extern int re_compile_fastmap (); 01747 extern char * re_comp (); 01748 extern int re_exec (); 01749 extern int regcomp (); 01750 extern int regexec (); 01751 extern size_t regerror (); 01752 extern void regfree (); 01753 01754 #endif /* STDC */ 01755 01756 01757 01758 #ifdef RX_WANT_RX_DEFS 01759 01760 struct rx_counter_frame 01761 { 01762 int tag; 01763 int val; 01764 struct rx_counter_frame * inherited_from; /* If this is a copy. */ 01765 struct rx_counter_frame * cdr; 01766 }; 01767 01768 struct rx_backtrack_frame 01769 { 01770 char * counter_stack_sp; 01771 01772 /* A frame is used to save the matchers state when it crosses a 01773 * backtracking point. The `stk_' fields correspond to variables 01774 * in re_search_2 (just strip off thes `stk_'). They are documented 01775 * tere. 01776 */ 01777 struct rx_superstate * stk_super; 01778 unsigned int stk_c; 01779 struct rx_string_position stk_test_pos; 01780 int stk_last_l; 01781 int stk_last_r; 01782 int stk_test_ret; 01783 01784 /* This is the list of options left to explore at the backtrack 01785 * point for which this frame was created. 01786 */ 01787 struct rx_distinct_future * df; 01788 struct rx_distinct_future * first_df; 01789 01790 #ifdef RX_DEBUG 01791 int stk_line_no; 01792 #endif 01793 }; 01794 01795 struct rx_stack_chunk 01796 { 01797 struct rx_stack_chunk * next_chunk; 01798 int bytes_left; 01799 char * sp; 01800 }; 01801 01802 enum rx_outer_entry 01803 { 01804 rx_outer_start, 01805 rx_outer_fastmap, 01806 rx_outer_test, 01807 rx_outer_restore_pos 01808 }; 01809 01810 enum rx_fastmap_return 01811 { 01812 rx_fastmap_continuation, 01813 rx_fastmap_error, 01814 rx_fastmap_ok, 01815 rx_fastmap_fail 01816 }; 01817 01818 enum rx_fastmap_entry 01819 { 01820 rx_fastmap_start, 01821 rx_fastmap_string_break 01822 }; 01823 01824 enum rx_test_return 01825 { 01826 rx_test_continuation, 01827 rx_test_error, 01828 rx_test_fail, 01829 rx_test_ok 01830 }; 01831 01832 enum rx_test_internal_return 01833 { 01834 rx_test_internal_error, 01835 rx_test_found_first, 01836 rx_test_line_finished 01837 }; 01838 01839 enum rx_test_match_entry 01840 { 01841 rx_test_start, 01842 rx_test_cache_hit_loop, 01843 rx_test_backreference_check, 01844 rx_test_backtrack_return 01845 }; 01846 01847 struct rx_search_state 01848 { 01849 /* Two groups of registers are kept. The group with the register state 01850 * of the current test match, and the group that holds the state at the end 01851 * of the best known match, if any. 01852 * 01853 * For some patterns, there may also be registers saved on the stack. 01854 */ 01855 unsigned num_regs; /* Includes an element for register zero. */ 01856 regoff_t * lparen; /* scratch space for register returns */ 01857 regoff_t * rparen; 01858 regoff_t * best_lpspace; /* in case the user doesn't want these */ 01859 regoff_t * best_rpspace; /* values, we still need space to store 01860 * them. Normally, this memoryis unused 01861 * and the space pointed to by REGS is 01862 * used instead. 01863 */ 01864 01865 int last_l; /* Highest index of a valid lparen. */ 01866 int last_r; /* It's dual. */ 01867 01868 int * best_lparen; /* This contains the best known register */ 01869 int * best_rparen; /* assignments. 01870 * This may point to the same mem as 01871 * best_lpspace, or it might point to memory 01872 * passed by the caller. 01873 */ 01874 int best_last_l; /* best_last_l:best_lparen::last_l:lparen */ 01875 int best_last_r; 01876 01877 01878 unsigned char * translate; 01879 01880 struct rx_string_position outer_pos; 01881 01882 struct rx_superstate * start_super; 01883 int nfa_choice; 01884 int first_found; /* If true, return after finding any match. */ 01885 int ret_val; 01886 01887 /* For continuations... */ 01888 enum rx_outer_entry outer_search_resume_pt; 01889 struct re_pattern_buffer * saved_rxb; 01890 int saved_startpos; 01891 int saved_range; 01892 int saved_stop; 01893 int saved_total_size; 01894 rx_get_burst_fn saved_get_burst; 01895 rx_back_check_fn saved_back_check; 01896 struct re_registers * saved_regs; 01897 01901 char * fastmap; 01902 int fastmap_chr; 01903 int fastmap_val; 01904 01905 /* for continuations in the fastmap procedure: */ 01906 enum rx_fastmap_entry fastmap_resume_pt; 01907 01912 /* The current superNFA position of the matcher. */ 01913 struct rx_superstate * super; 01914 01915 /* The matcher interprets a series of instruction frames. 01916 * This is the `instruction counter' for the interpretation. 01917 */ 01918 struct rx_inx * ifr; 01919 01920 /* We insert a ghost character in the string to prime 01921 * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half 01922 * keep track of the test-match position and string-half. 01923 */ 01924 unsigned char c; 01925 01926 /* Position within the string. */ 01927 struct rx_string_position test_pos; 01928 01929 struct rx_stack_chunk * counter_stack; 01930 struct rx_stack_chunk * backtrack_stack; 01931 int backtrack_frame_bytes; 01932 int chunk_bytes; 01933 struct rx_stack_chunk * free_chunks; 01934 01935 /* To return from this function, set test_ret and 01936 * `goto test_do_return'. 01937 * 01938 * Possible return values are: 01939 * 1 --- end of string while the superNFA is still going 01940 * 0 --- internal error (out of memory) 01941 * -1 --- search completed by reaching the superNFA fail state 01942 * -2 --- a match was found, maybe not the longest. 01943 * 01944 * When the search is complete (-1), best_last_r indicates whether 01945 * a match was found. 01946 * 01947 * -2 is return only if search_state.first_found is non-zero. 01948 * 01949 * if search_state.first_found is non-zero, a return of -1 indicates no match, 01950 * otherwise, best_last_r has to be checked. 01951 */ 01952 int test_ret; 01953 01954 int could_have_continued; 01955 01956 #ifdef RX_DEBUG 01957 int backtrack_depth; 01958 /* There is a search tree with every node as set of deterministic 01959 * transitions in the super nfa. For every branch of a 01960 * backtrack point is an edge in the tree. 01961 * This counts up a pre-order of nodes in that tree. 01962 * It's saved on the search stack and printed when debugging. 01963 */ 01964 int line_no; 01965 int lines_found; 01966 #endif 01967 01968 01969 /* For continuations within the match tester */ 01970 enum rx_test_match_entry test_match_resume_pt; 01971 struct rx_inx * saved_next_tr_table; 01972 struct rx_inx * saved_this_tr_table; 01973 int saved_reg; 01974 struct rx_backtrack_frame * saved_bf; 01975 01976 }; 01977 01978 01979 extern char rx_slowmap[]; 01980 extern unsigned char rx_id_translation[]; 01981 01982 static __inline__ void 01983 init_fastmap (rxb, search_state) 01984 struct re_pattern_buffer * rxb; 01985 struct rx_search_state * search_state; 01986 { 01987 search_state->fastmap = (rxb->fastmap 01988 ? (char *)rxb->fastmap 01989 : (char *)rx_slowmap); 01990 /* Update the fastmap now if not correct already. 01991 * When the regexp was compiled, the fastmap was computed 01992 * and stored in a bitset. This expands the bitset into a 01993 * character array containing 1s and 0s. 01994 */ 01995 if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate) 01996 rx_blow_up_fastmap (rxb); 01997 search_state->fastmap_chr = -1; 01998 search_state->fastmap_val = 0; 01999 search_state->fastmap_resume_pt = rx_fastmap_start; 02000 } 02001 02002 static __inline__ void 02003 uninit_fastmap (rxb, search_state) 02004 struct re_pattern_buffer * rxb; 02005 struct rx_search_state * search_state; 02006 { 02007 /* Unset the fastmap sentinel */ 02008 if (search_state->fastmap_chr >= 0) 02009 search_state->fastmap[search_state->fastmap_chr] 02010 = search_state->fastmap_val; 02011 } 02012 02013 static __inline__ int 02014 fastmap_search (rxb, stop, get_burst, app_closure, search_state) 02015 struct re_pattern_buffer * rxb; 02016 int stop; 02017 rx_get_burst_fn get_burst; 02018 void * app_closure; 02019 struct rx_search_state * search_state; 02020 { 02021 enum rx_fastmap_entry pc; 02022 02023 if (0) 02024 { 02025 return_continuation: 02026 search_state->fastmap_resume_pt = pc; 02027 return rx_fastmap_continuation; 02028 } 02029 02030 pc = search_state->fastmap_resume_pt; 02031 02032 switch (pc) 02033 { 02034 default: 02035 return rx_fastmap_error; 02036 case rx_fastmap_start: 02037 init_fastmap_sentinal: 02038 /* For the sake of fast fastmapping, set a sentinal in the fastmap. 02039 * This sentinal will trap the fastmap loop when it reaches the last 02040 * valid character in a string half. 02041 * 02042 * This must be reset when the fastmap/search loop crosses a string 02043 * boundry, and before returning to the caller. So sometimes, 02044 * the fastmap loop is restarted with `continue', othertimes by 02045 * `goto init_fastmap_sentinal'. 02046 */ 02047 if (search_state->outer_pos.size) 02048 { 02049 search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1) 02050 ? *(search_state->outer_pos.end - 1) 02051 : *search_state->outer_pos.string); 02052 search_state->fastmap_val 02053 = search_state->fastmap[search_state->fastmap_chr]; 02054 search_state->fastmap[search_state->fastmap_chr] = 1; 02055 } 02056 else 02057 { 02058 search_state->fastmap_chr = -1; 02059 search_state->fastmap_val = 0; 02060 } 02061 02062 if (search_state->outer_pos.pos >= search_state->outer_pos.end) 02063 goto fastmap_hit_bound; 02064 else 02065 { 02066 if (search_state->outer_pos.search_direction == 1) 02067 { 02068 if (search_state->fastmap_val) 02069 { 02070 for (;;) 02071 { 02072 while (!search_state->fastmap[*search_state->outer_pos.pos]) 02073 ++search_state->outer_pos.pos; 02074 return rx_fastmap_ok; 02075 } 02076 } 02077 else 02078 { 02079 for (;;) 02080 { 02081 while (!search_state->fastmap[*search_state->outer_pos.pos]) 02082 ++search_state->outer_pos.pos; 02083 if (*search_state->outer_pos.pos != search_state->fastmap_chr) 02084 return rx_fastmap_ok; 02085 else 02086 { 02087 ++search_state->outer_pos.pos; 02088 if (search_state->outer_pos.pos == search_state->outer_pos.end) 02089 goto fastmap_hit_bound; 02090 } 02091 } 02092 } 02093 } 02094 else 02095 { 02096 __const__ unsigned char * bound; 02097 bound = search_state->outer_pos.string - 1; 02098 if (search_state->fastmap_val) 02099 { 02100 for (;;) 02101 { 02102 while (!search_state->fastmap[*search_state->outer_pos.pos]) 02103 --search_state->outer_pos.pos; 02104 return rx_fastmap_ok; 02105 } 02106 } 02107 else 02108 { 02109 for (;;) 02110 { 02111 while (!search_state->fastmap[*search_state->outer_pos.pos]) 02112 --search_state->outer_pos.pos; 02113 if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val) 02114 return rx_fastmap_ok; 02115 else 02116 { 02117 --search_state->outer_pos.pos; 02118 if (search_state->outer_pos.pos == bound) 02119 goto fastmap_hit_bound; 02120 } 02121 } 02122 } 02123 } 02124 } 02125 02126 case rx_fastmap_string_break: 02127 fastmap_hit_bound: 02128 { 02129 /* If we hit a bound, it may be time to fetch another burst 02130 * of string, or it may be time to return a continuation to 02131 * the caller, or it might be time to fail. 02132 */ 02133 02134 int burst_state; 02135 burst_state = get_burst (&search_state->outer_pos, app_closure, stop); 02136 switch (burst_state) 02137 { 02138 default: 02139 case rx_get_burst_error: 02140 return rx_fastmap_error; 02141 case rx_get_burst_continuation: 02142 { 02143 pc = rx_fastmap_string_break; 02144 goto return_continuation; 02145 } 02146 case rx_get_burst_ok: 02147 goto init_fastmap_sentinal; 02148 case rx_get_burst_no_more: 02149 /* ...not a string split, simply no more string. 02150 * 02151 * When searching backward, running out of string 02152 * is reason to quit. 02153 * 02154 * When searching forward, we allow the possibility 02155 * of an (empty) match after the last character in the 02156 * virtual string. So, fall through to the matcher 02157 */ 02158 return ( (search_state->outer_pos.search_direction == 1) 02159 ? rx_fastmap_ok 02160 : rx_fastmap_fail); 02161 } 02162 } 02163 } 02164 02165 } 02166 02167 02168 02169 #ifdef emacs 02170 #error this is not emacs 02171 // /* The `emacs' switch turns on certain matching commands 02172 // * that make sense only in Emacs. 02173 // */ 02174 // #include "config.h" 02175 // #include "lisp.h" 02176 // #include "buffer.h" 02177 // #include "syntax.h" 02178 #endif /* emacs */ 02179 02180 /* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar 02181 * packages too. 02182 */ 02183 #ifdef RX_MEMDBUG 02184 #include <malloc.h> 02185 #endif /* RX_RX_MEMDBUG */ 02186 02187 /* We used to test for `BSTRING' here, but only GCC and Emacs define 02188 * `BSTRING', as far as I know, and neither of them use this code. 02189 */ 02190 #if HAVE_STRING_H || STDC_HEADERS 02191 #include <string.h> 02192 02193 #ifndef bcmp 02194 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 02195 #endif 02196 02197 #ifndef bcopy 02198 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 02199 #endif 02200 02201 #ifndef bzero 02202 #define bzero(s, n) memset ((s), 0, (n)) 02203 #endif 02204 02205 #else /* HAVE_STRING_H || STDC_HEADERS */ 02206 #include <strings.h> 02207 #endif /* not (HAVE_STRING_H || STDC_HEADERS) */ 02208 02209 #ifdef STDC_HEADERS 02210 #include <stdlib.h> 02211 #else /* not STDC_HEADERS */ 02212 /* char *malloc (); 02213 * char *realloc (); 02214 */ 02215 #endif /* not STDC_HEADERS */ 02216 02217 02218 02219 02220 /* How many characters in the character set. */ 02221 #define CHAR_SET_SIZE (1 << CHARBITS) 02222 02223 #ifndef emacs 02224 /* Define the syntax basics for <, >, etc. 02225 * This must be nonzero for the wordchar and notwordchar pattern 02226 * commands in re_match_2. 02227 */ 02228 #ifndef Sword 02229 #define Sword 1 02230 #endif 02231 #define SYNTAX(c) re_syntax_table[c] 02232 RX_DECL char re_syntax_table[CHAR_SET_SIZE]; 02233 #endif /* not emacs */ 02234 02235 02236 /* Test if at very beginning or at very end of the virtual concatenation 02237 * of `string1' and `string2'. If only one string, it's `string2'. 02238 */ 02239 02240 #define AT_STRINGS_BEG() \ 02241 ( -1 \ 02242 == ((search_state.test_pos.pos - search_state.test_pos.string) \ 02243 + search_state.test_pos.offset)) 02244 02245 #define AT_STRINGS_END() \ 02246 ( (total_size - 1) \ 02247 == ((search_state.test_pos.pos - search_state.test_pos.string) \ 02248 + search_state.test_pos.offset)) 02249 02250 02251 /* Test if POS + 1 points to a character which is word-constituent. We have 02252 * two special cases to check for: if past the end of string1, look at 02253 * the first character in string2; and if before the beginning of 02254 * string2, look at the last character in string1. 02255 * 02256 * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG (). 02257 */ 02258 #define LETTER_P(POS,OFF) \ 02259 ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \ 02260 == Sword) 02261 02262 /* Test if the character at D and the one after D differ with respect 02263 * to being word-constituent. 02264 */ 02265 #define AT_WORD_BOUNDARY(d) \ 02266 (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1)) 02267 02268 02269 #ifdef RX_SUPPORT_CONTINUATIONS 02270 #define RX_STACK_ALLOC(BYTES) malloc(BYTES) 02271 #define RX_STACK_FREE(MEM) free(MEM) 02272 #else 02273 #define RX_STACK_ALLOC(BYTES) alloca(BYTES) 02274 #define RX_STACK_FREE(MEM) \ 02275 ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \ 02276 search_state.free_chunks = ((struct rx_stack_chunk *)MEM); 02277 02278 #endif 02279 02280 #define PUSH(CHUNK_VAR,BYTES) \ 02281 if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \ 02282 { \ 02283 struct rx_stack_chunk * new_chunk; \ 02284 if (search_state.free_chunks) \ 02285 { \ 02286 new_chunk = search_state.free_chunks; \ 02287 search_state.free_chunks = search_state.free_chunks->next_chunk; \ 02288 } \ 02289 else \ 02290 { \ 02291 new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \ 02292 if (!new_chunk) \ 02293 { \ 02294 search_state.ret_val = 0; \ 02295 goto test_do_return; \ 02296 } \ 02297 } \ 02298 new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \ 02299 new_chunk->bytes_left = (search_state.chunk_bytes \ 02300 - (BYTES) \ 02301 - sizeof (struct rx_stack_chunk)); \ 02302 new_chunk->next_chunk = CHUNK_VAR; \ 02303 CHUNK_VAR = new_chunk; \ 02304 } \ 02305 else \ 02306 (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES)) 02307 02308 #define POP(CHUNK_VAR,BYTES) \ 02309 if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \ 02310 { \ 02311 struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \ 02312 RX_STACK_FREE(CHUNK_VAR); \ 02313 CHUNK_VAR = new_chunk; \ 02314 } \ 02315 else \ 02316 (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES) 02317 02318 02319 02320 #define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)] 02321 02322 02323 02324 02325 #ifdef __STDC__ 02326 RX_DECL __inline__ int 02327 rx_search (struct re_pattern_buffer * rxb, 02328 int startpos, 02329 int range, 02330 int stop, 02331 int total_size, 02332 rx_get_burst_fn get_burst, 02333 rx_back_check_fn back_check, 02334 rx_fetch_char_fn fetch_char, 02335 void * app_closure, 02336 struct re_registers * regs, 02337 struct rx_search_state * resume_state, 02338 struct rx_search_state * save_state) 02339 #else 02340 RX_DECL __inline__ int 02341 rx_search (rxb, startpos, range, stop, total_size, 02342 get_burst, back_check, fetch_char, 02343 app_closure, regs, resume_state, save_state) 02344 struct re_pattern_buffer * rxb; 02345 int startpos; 02346 int range; 02347 int stop; 02348 int total_size; 02349 rx_get_burst_fn get_burst; 02350 rx_back_check_fn back_check; 02351 rx_fetch_char_fn fetch_char; 02352 void * app_closure; 02353 struct re_registers * regs; 02354 struct rx_search_state * resume_state; 02355 struct rx_search_state * save_state; 02356 #endif 02357 { 02358 int pc; 02359 int test_state; 02360 struct rx_search_state search_state; 02361 02362 search_state.free_chunks = 0; 02363 if (!resume_state) 02364 pc = rx_outer_start; 02365 else 02366 { 02367 search_state = *resume_state; 02368 regs = search_state.saved_regs; 02369 rxb = search_state.saved_rxb; 02370 startpos = search_state.saved_startpos; 02371 range = search_state.saved_range; 02372 stop = search_state.saved_stop; 02373 total_size = search_state.saved_total_size; 02374 get_burst = search_state.saved_get_burst; 02375 back_check = search_state.saved_back_check; 02376 pc = search_state.outer_search_resume_pt; 02377 if (0) 02378 { 02379 return_continuation: 02380 if (save_state) 02381 { 02382 *save_state = search_state; 02383 save_state->saved_regs = regs; 02384 save_state->saved_rxb = rxb; 02385 save_state->saved_startpos = startpos; 02386 save_state->saved_range = range; 02387 save_state->saved_stop = stop; 02388 save_state->saved_total_size = total_size; 02389 save_state->saved_get_burst = get_burst; 02390 save_state->saved_back_check = back_check; 02391 save_state->outer_search_resume_pt = pc; 02392 } 02393 return rx_search_continuation; 02394 } 02395 } 02396 02397 switch (pc) 02398 { 02399 case rx_outer_start: 02400 search_state.ret_val = rx_search_fail; 02401 ( search_state.lparen 02402 = search_state.rparen 02403 = search_state.best_lpspace 02404 = search_state.best_rpspace 02405 = 0); 02406 02407 /* figure the number of registers we may need for use in backreferences. 02408 * the number here includes an element for register zero. 02409 */ 02410 search_state.num_regs = rxb->re_nsub + 1; 02411 02412 02413 /* check for out-of-range startpos. */ 02414 if ((startpos < 0) || (startpos > total_size)) 02415 return rx_search_fail; 02416 02417 /* fix up range if it might eventually take us outside the string. */ 02418 { 02419 int endpos; 02420 endpos = startpos + range; 02421 if (endpos < -1) 02422 range = (-1 - startpos); 02423 else if (endpos > (total_size + 1)) 02424 range = total_size - startpos; 02425 } 02426 02427 /* if the search isn't to be a backwards one, don't waste time in a 02428 * long search for a pattern that says it is anchored. 02429 */ 02430 if (rxb->begbuf_only && (range > 0)) 02431 { 02432 if (startpos > 0) 02433 return rx_search_fail; 02434 else 02435 range = 1; 02436 } 02437 02438 /* decide whether to use internal or user-provided reg buffers. */ 02439 if (!regs || rxb->no_sub) 02440 { 02441 search_state.best_lpspace = 02442 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); 02443 search_state.best_rpspace = 02444 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); 02445 search_state.best_lparen = search_state.best_lpspace; 02446 search_state.best_rparen = search_state.best_rpspace; 02447 } 02448 else 02449 { 02450 /* have the register data arrays been allocated? */ 02451 if (rxb->regs_allocated == REGS_UNALLOCATED) 02452 { /* no. so allocate them with malloc. we need one 02453 extra element beyond `search_state.num_regs' for the `-1' marker 02454 gnu code uses. */ 02455 regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1); 02456 regs->start = ((regoff_t *) 02457 malloc (regs->num_regs * sizeof ( regoff_t))); 02458 regs->end = ((regoff_t *) 02459 malloc (regs->num_regs * sizeof ( regoff_t))); 02460 if (regs->start == 0 || regs->end == 0) 02461 return rx_search_error; 02462 rxb->regs_allocated = REGS_REALLOCATE; 02463 } 02464 else if (rxb->regs_allocated == REGS_REALLOCATE) 02465 { /* yes. if we need more elements than were already 02466 allocated, reallocate them. if we need fewer, just 02467 leave it alone. */ 02468 if (regs->num_regs < search_state.num_regs + 1) 02469 { 02470 regs->num_regs = search_state.num_regs + 1; 02471 regs->start = ((regoff_t *) 02472 realloc (regs->start, 02473 regs->num_regs * sizeof (regoff_t))); 02474 regs->end = ((regoff_t *) 02475 realloc (regs->end, 02476 regs->num_regs * sizeof ( regoff_t))); 02477 if (regs->start == 0 || regs->end == 0) 02478 return rx_search_error; 02479 } 02480 } 02481 else if (rxb->regs_allocated != REGS_FIXED) 02482 return rx_search_error; 02483 02484 if (regs->num_regs < search_state.num_regs + 1) 02485 { 02486 search_state.best_lpspace = 02487 ((regoff_t *) 02488 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); 02489 search_state.best_rpspace = 02490 ((regoff_t *) 02491 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); 02492 search_state.best_lparen = search_state.best_lpspace; 02493 search_state.best_rparen = search_state.best_rpspace; 02494 } 02495 else 02496 { 02497 search_state.best_lparen = regs->start; 02498 search_state.best_rparen = regs->end; 02499 } 02500 } 02501 02502 search_state.lparen = 02503 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); 02504 search_state.rparen = 02505 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); 02506 02507 if (! ( search_state.best_rparen 02508 && search_state.best_lparen 02509 && search_state.lparen && search_state.rparen)) 02510 return rx_search_error; 02511 02512 search_state.best_last_l = search_state.best_last_r = -1; 02513 02514 search_state.translate = (rxb->translate 02515 ? rxb->translate 02516 : rx_id_translation); 02517 02518 02519 02520 /* 02521 * two nfa's were compiled. 02522 * `0' is complete. 02523 * `1' faster but gets registers wrong and ends too soon. 02524 */ 02525 search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1'; 02526 02527 /* we have the option to look for the best match or the first 02528 * one we can find. if the user isn't asking for register information, 02529 * we don't need to find the best match. 02530 */ 02531 search_state.first_found = !regs; 02532 02533 if (range >= 0) 02534 { 02535 search_state.outer_pos.search_end = startpos + range; 02536 search_state.outer_pos.search_direction = 1; 02537 } 02538 else 02539 { 02540 search_state.outer_pos.search_end = startpos + range; 02541 search_state.outer_pos.search_direction = -1; 02542 } 02543 02544 /* the vacuous search always turns up nothing. */ 02545 if ((search_state.outer_pos.search_direction == 1) 02546 ? (startpos > search_state.outer_pos.search_end) 02547 : (startpos < search_state.outer_pos.search_end)) 02548 return rx_search_fail; 02549 02550 /* now we build the starting state of the supernfa. */ 02551 { 02552 struct rx_superset * start_contents; 02553 struct rx_nfa_state_set * start_nfa_set; 02554 02555 /* we presume here that the nfa start state has only one 02556 * possible future with no side effects. 02557 */ 02558 start_nfa_set = rxb->start->futures->destset; 02559 if ( rxb->rx.start_set 02560 && (rxb->rx.start_set->starts_for == &rxb->rx)) 02561 start_contents = rxb->rx.start_set; 02562 else 02563 { 02564 start_contents = 02565 rx_superstate_eclosure_union (&rxb->rx, 02566 rx_superset_cons (&rxb->rx, 0, 0), 02567 start_nfa_set); 02568 02569 if (!start_contents) 02570 return rx_search_fail; 02571 02572 start_contents->starts_for = &rxb->rx; 02573 rxb->rx.start_set = start_contents; 02574 } 02575 if ( start_contents->superstate 02576 && (start_contents->superstate->rx_id == rxb->rx.rx_id)) 02577 { 02578 search_state.start_super = start_contents->superstate; 02579 rx_lock_superstate (&rxb->rx, search_state.start_super); 02580 } 02581 else 02582 { 02583 rx_protect_superset (&rxb->rx, start_contents); 02584 02585 search_state.start_super = rx_superstate (&rxb->rx, start_contents); 02586 if (!search_state.start_super) 02587 return rx_search_fail; 02588 rx_lock_superstate (&rxb->rx, search_state.start_super); 02589 rx_release_superset (&rxb->rx, start_contents); 02590 } 02591 } 02592 02593 02594 /* The outer_pos tracks the position within the strings 02595 * as seen by loop that calls fastmap_search. 02596 * 02597 * The caller supplied get_burst function actually 02598 * gives us pointers to chars. 02599 * 02600 * Communication with the get_burst function is through an 02601 * rx_string_position structure. Here, the structure for 02602 * outer_pos is initialized. It is set to point to the 02603 * NULL string, at an offset of STARTPOS. STARTPOS is out 02604 * of range of the NULL string, so the first call to 02605 * getburst will patch up the rx_string_position to point 02606 * to valid characters. 02607 */ 02608 02609 ( search_state.outer_pos.string 02610 = search_state.outer_pos.end 02611 = 0); 02612 02613 search_state.outer_pos.offset = 0; 02614 search_state.outer_pos.size = 0; 02615 search_state.outer_pos.pos = (unsigned char *)startpos; 02616 init_fastmap (rxb, &search_state); 02617 02618 search_state.fastmap_resume_pt = rx_fastmap_start; 02619 case rx_outer_fastmap: 02620 /* do { */ 02621 pseudo_do: 02622 { 02623 { 02624 int fastmap_state; 02625 fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure, 02626 &search_state); 02627 switch (fastmap_state) 02628 { 02629 case rx_fastmap_continuation: 02630 pc = rx_outer_fastmap; 02631 goto return_continuation; 02632 case rx_fastmap_fail: 02633 goto finish; 02634 case rx_fastmap_ok: 02635 break; 02636 } 02637 } 02638 02639 /* now the fastmap loop has brought us to a plausible 02640 * starting point for a match. so, it's time to run the 02641 * nfa and see if a match occured. 02642 */ 02643 startpos = ( search_state.outer_pos.pos 02644 - search_state.outer_pos.string 02645 + search_state.outer_pos.offset); 02646 #if 0 02647 /*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end)) 02648 /*|*/ goto finish; 02649 #endif 02650 } 02651 02652 search_state.test_match_resume_pt = rx_test_start; 02653 /* do interrupted for entry point... */ 02654 case rx_outer_test: 02655 /* ...do continued */ 02656 { 02657 goto test_match; 02658 test_returns_to_search: 02659 switch (test_state) 02660 { 02661 case rx_test_continuation: 02662 pc = rx_outer_test; 02663 goto return_continuation; 02664 case rx_test_error: 02665 search_state.ret_val = rx_search_error; 02666 goto finish; 02667 case rx_test_fail: 02668 break; 02669 case rx_test_ok: 02670 goto finish; 02671 } 02672 search_state.outer_pos.pos += search_state.outer_pos.search_direction; 02673 startpos += search_state.outer_pos.search_direction; 02674 #if 0 02675 /*|*/ if (search_state.test_pos.pos < search_state.test_pos.end) 02676 /*|*/ break; 02677 #endif 02678 } 02679 /* do interrupted for entry point... */ 02680 case rx_outer_restore_pos: 02681 { 02682 int x; 02683 x = get_burst (&search_state.outer_pos, app_closure, stop); 02684 switch (x) 02685 { 02686 case rx_get_burst_continuation: 02687 pc = rx_outer_restore_pos; 02688 goto return_continuation; 02689 case rx_get_burst_error: 02690 search_state.ret_val = rx_search_error; 02691 goto finish; 02692 case rx_get_burst_no_more: 02693 if (rxb->can_match_empty) 02694 break; 02695 goto finish; 02696 case rx_get_burst_ok: 02697 break; 02698 } 02699 } /* } while (...see below...) */ 02700 02701 if ((search_state.outer_pos.search_direction == 1) 02702 ? (startpos <= search_state.outer_pos.search_end) 02703 : (startpos > search_state.outer_pos.search_end)) 02704 goto pseudo_do; 02705 02706 02707 finish: 02708 uninit_fastmap (rxb, &search_state); 02709 if (search_state.start_super) 02710 rx_unlock_superstate (&rxb->rx, search_state.start_super); 02711 02712 #ifdef regex_malloc 02713 if (search_state.lparen) free (search_state.lparen); 02714 if (search_state.rparen) free (search_state.rparen); 02715 if (search_state.best_lpspace) free (search_state.best_lpspace); 02716 if (search_state.best_rpspace) free (search_state.best_rpspace); 02717 #endif 02718 return search_state.ret_val; 02719 } 02720 02721 02722 test_match: 02723 { 02724 enum rx_test_match_entry test_pc; 02725 int inx; 02726 test_pc = search_state.test_match_resume_pt; 02727 if (test_pc == rx_test_start) 02728 { 02729 #ifdef RX_DEBUG 02730 search_state.backtrack_depth = 0; 02731 #endif 02732 search_state.last_l = search_state.last_r = 0; 02733 search_state.lparen[0] = startpos; 02734 search_state.super = search_state.start_super; 02735 search_state.c = search_state.nfa_choice; 02736 search_state.test_pos.pos = search_state.outer_pos.pos - 1; 02737 search_state.test_pos.string = search_state.outer_pos.string; 02738 search_state.test_pos.end = search_state.outer_pos.end; 02739 search_state.test_pos.offset = search_state.outer_pos.offset; 02740 search_state.test_pos.size = search_state.outer_pos.size; 02741 search_state.test_pos.search_direction = 1; 02742 search_state.counter_stack = 0; 02743 search_state.backtrack_stack = 0; 02744 search_state.backtrack_frame_bytes = 02745 (sizeof (struct rx_backtrack_frame) 02746 + (rxb->match_regs_on_stack 02747 ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2 02748 : 0)); 02749 search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64; 02750 search_state.test_ret = rx_test_line_finished; 02751 search_state.could_have_continued = 0; 02752 } 02753 /* This is while (1)...except that the body of the loop is interrupted 02754 * by some alternative entry points. 02755 */ 02756 pseudo_while_1: 02757 switch (test_pc) 02758 { 02759 case rx_test_cache_hit_loop: 02760 goto resume_continuation_1; 02761 case rx_test_backreference_check: 02762 goto resume_continuation_2; 02763 case rx_test_backtrack_return: 02764 goto resume_continuation_3; 02765 case rx_test_start: 02766 #ifdef RX_DEBUG 02767 /* There is a search tree with every node as set of deterministic 02768 * transitions in the super nfa. For every branch of a 02769 * backtrack point is an edge in the tree. 02770 * This counts up a pre-order of nodes in that tree. 02771 * It's saved on the search stack and printed when debugging. 02772 */ 02773 search_state.line_no = 0; 02774 search_state.lines_found = 0; 02775 #endif 02776 02777 top_of_cycle: 02778 /* A superstate is basicly a transition table, indexed by 02779 * characters from the string being tested, and containing 02780 * RX_INX (`instruction frame') structures. 02781 */ 02782 search_state.ifr = &search_state.super->transitions [search_state.c]; 02783 02784 recurse_test_match: 02785 /* This is the point to which control is sent when the 02786 * test matcher `recurses'. Before jumping here, some variables 02787 * need to be saved on the stack and the next instruction frame 02788 * has to be computed. 02789 */ 02790 02791 restart: 02792 /* Some instructions don't advance the matcher, but just 02793 * carry out some side effects and fetch a new instruction. 02794 * To dispatch that new instruction, `goto restart'. 02795 */ 02796 02797 { 02798 struct rx_inx * next_tr_table; 02799 struct rx_inx * this_tr_table; 02800 /* The fastest route through the loop is when the instruction 02801 * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA 02802 * is non-zero. In that case, it points to the next 02803 * superstate. 02804 * 02805 * This allows us to not bother fetching the bytecode. 02806 */ 02807 next_tr_table = (struct rx_inx *)search_state.ifr->data; 02808 this_tr_table = search_state.super->transitions; 02809 while (next_tr_table) 02810 { 02811 #ifdef RX_DEBUG_0 02812 if (rx_debug_trace) 02813 { 02814 struct rx_superset * setp; 02815 02816 fprintf (stderr, "%d %d>> re_next_char @ %d (%d)", 02817 search_state.line_no, 02818 search_state.backtrack_depth, 02819 (search_state.test_pos.pos - search_state.test_pos.string 02820 + search_state.test_pos.offset), search_state.c); 02821 02822 search_state.super = 02823 ((struct rx_superstate *) 02824 ((char *)this_tr_table 02825 - ((unsigned long) 02826 ((struct rx_superstate *)0)->transitions))); 02827 02828 setp = search_state.super->contents; 02829 fprintf (stderr, " superstet (rx=%d, &=%x: ", 02830 rxb->rx.rx_id, setp); 02831 while (setp) 02832 { 02833 fprintf (stderr, "%d ", setp->id); 02834 setp = setp->cdr; 02835 } 02836 fprintf (stderr, "\n"); 02837 } 02838 #endif 02839 this_tr_table = next_tr_table; 02840 ++search_state.test_pos.pos; 02841 if (search_state.test_pos.pos == search_state.test_pos.end) 02842 { 02843 int burst_state; 02844 try_burst_1: 02845 burst_state = get_burst (&search_state.test_pos, 02846 app_closure, stop); 02847 switch (burst_state) 02848 { 02849 case rx_get_burst_continuation: 02850 search_state.saved_this_tr_table = this_tr_table; 02851 search_state.saved_next_tr_table = next_tr_table; 02852 test_pc = rx_test_cache_hit_loop; 02853 goto test_return_continuation; 02854 02855 resume_continuation_1: 02856 /* Continuation one jumps here to do its work: */ 02857 search_state.saved_this_tr_table = this_tr_table; 02858 search_state.saved_next_tr_table = next_tr_table; 02859 goto try_burst_1; 02860 02861 case rx_get_burst_ok: 02862 /* get_burst succeeded...keep going */ 02863 break; 02864 02865 case rx_get_burst_no_more: 02866 search_state.test_ret = rx_test_line_finished; 02867 search_state.could_have_continued = 1; 02868 goto test_do_return; 02869 02870 case rx_get_burst_error: 02871 /* An error... */ 02872 search_state.test_ret = rx_test_internal_error; 02873 goto test_do_return; 02874 } 02875 } 02876 search_state.c = *search_state.test_pos.pos; 02877 search_state.ifr = this_tr_table + search_state.c; 02878 next_tr_table = (struct rx_inx *)search_state.ifr->data; 02879 } /* Fast loop through cached transition tables */ 02880 02881 /* Here when we ran out of cached next-char transitions. 02882 * So, it will be necessary to do a more expensive 02883 * dispatch on the current instruction. The superstate 02884 * pointer is allowed to become invalid during next-char 02885 * transitions -- now we must bring it up to date. 02886 */ 02887 search_state.super = 02888 ((struct rx_superstate *) 02889 ((char *)this_tr_table 02890 - ((unsigned long) 02891 ((struct rx_superstate *)0)->transitions))); 02892 } 02893 02894 /* We've encountered an instruction other than next-char. 02895 * Dispatch that instruction: 02896 */ 02897 inx = (int)search_state.ifr->inx; 02898 #ifdef RX_DEBUG_0 02899 if (rx_debug_trace) 02900 { 02901 struct rx_superset * setp = search_state.super->contents; 02902 02903 fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no, 02904 search_state.backtrack_depth, 02905 inx_names[inx], 02906 (search_state.test_pos.pos - search_state.test_pos.string 02907 + (test_pos.half == 0 ? 0 : size1)), search_state.c); 02908 02909 fprintf (stderr, " superstet (rx=%d, &=%x: ", 02910 rxb->rx.rx_id, setp); 02911 while (setp) 02912 { 02913 fprintf (stderr, "%d ", setp->id); 02914 setp = setp->cdr; 02915 } 02916 fprintf (stderr, "\n"); 02917 } 02918 #endif 02919 switch ((enum rx_opcode)inx) 02920 { 02921 case rx_do_side_effects: 02922 02923 /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon 02924 * edges associated with parentheses, backreferencing, etc. 02925 */ 02926 { 02927 struct rx_distinct_future * df = 02928 (struct rx_distinct_future *)search_state.ifr->data_2; 02929 struct rx_se_list * el = df->effects; 02930 /* Side effects come in lists. This walks down 02931 * a list, dispatching. 02932 */ 02933 while (el) 02934 { 02935 long effect; 02936 effect = (long)el->car; 02937 if (effect < 0) 02938 { 02939 #ifdef RX_DEBUG_0 02940 if (rx_debug_trace) 02941 { 02942 struct rx_superset * setp = search_state.super->contents; 02943 02944 fprintf (stderr, "....%d %d>> %s\n", search_state.line_no, 02945 search_state.backtrack_depth, 02946 efnames[-effect]); 02947 } 02948 #endif 02949 switch ((enum re_side_effects) effect) 02950 02951 { 02952 case re_se_pushback: 02953 search_state.ifr = &df->future_frame; 02954 if (!search_state.ifr->data) 02955 { 02956 struct rx_superstate * sup; 02957 sup = search_state.super; 02958 rx_lock_superstate (rx, sup); 02959 if (!rx_handle_cache_miss (&rxb->rx, 02960 search_state.super, 02961 search_state.c, 02962 (search_state.ifr 02963 ->data_2))) 02964 { 02965 rx_unlock_superstate (rx, sup); 02966 search_state.test_ret = rx_test_internal_error; 02967 goto test_do_return; 02968 } 02969 rx_unlock_superstate (rx, sup); 02970 } 02971 /* --search_state.test_pos.pos; */ 02972 search_state.c = 't'; 02973 search_state.super 02974 = ((struct rx_superstate *) 02975 ((char *)search_state.ifr->data 02976 - (long)(((struct rx_superstate *)0) 02977 ->transitions))); 02978 goto top_of_cycle; 02979 break; 02980 case re_se_push0: 02981 { 02982 struct rx_counter_frame * old_cf 02983 = (search_state.counter_stack 02984 ? ((struct rx_counter_frame *) 02985 search_state.counter_stack->sp) 02986 : 0); 02987 struct rx_counter_frame * cf; 02988 PUSH (search_state.counter_stack, 02989 sizeof (struct rx_counter_frame)); 02990 cf = ((struct rx_counter_frame *) 02991 search_state.counter_stack->sp); 02992 cf->tag = re_se_iter; 02993 cf->val = 0; 02994 cf->inherited_from = 0; 02995 cf->cdr = old_cf; 02996 break; 02997 } 02998 case re_se_fail: 02999 goto test_do_return; 03000 case re_se_begbuf: 03001 if (!AT_STRINGS_BEG ()) 03002 goto test_do_return; 03003 break; 03004 case re_se_endbuf: 03005 if (!AT_STRINGS_END ()) 03006 goto test_do_return; 03007 break; 03008 case re_se_wordbeg: 03009 if ( LETTER_P (&search_state.test_pos, 1) 03010 && ( AT_STRINGS_BEG() 03011 || !LETTER_P (&search_state.test_pos, 0))) 03012 break; 03013 else 03014 goto test_do_return; 03015 case re_se_wordend: 03016 if ( !AT_STRINGS_BEG () 03017 && LETTER_P (&search_state.test_pos, 0) 03018 && (AT_STRINGS_END () 03019 || !LETTER_P (&search_state.test_pos, 1))) 03020 break; 03021 else 03022 goto test_do_return; 03023 case re_se_wordbound: 03024 if (AT_WORD_BOUNDARY (&search_state.test_pos)) 03025 break; 03026 else 03027 goto test_do_return; 03028 case re_se_notwordbound: 03029 if (!AT_WORD_BOUNDARY (&search_state.test_pos)) 03030 break; 03031 else 03032 goto test_do_return; 03033 case re_se_hat: 03034 if (AT_STRINGS_BEG ()) 03035 { 03036 if (rxb->not_bol) 03037 goto test_do_return; 03038 else 03039 break; 03040 } 03041 else 03042 { 03043 char pos_c = *search_state.test_pos.pos; 03044 if ( (SRCH_TRANSLATE (pos_c) 03045 == SRCH_TRANSLATE('\n')) 03046 && rxb->newline_anchor) 03047 break; 03048 else 03049 goto test_do_return; 03050 } 03051 case re_se_dollar: 03052 if (AT_STRINGS_END ()) 03053 { 03054 if (rxb->not_eol) 03055 goto test_do_return; 03056 else 03057 break; 03058 } 03059 else 03060 { 03061 if ( ( SRCH_TRANSLATE (fetch_char 03062 (&search_state.test_pos, 1, 03063 app_closure, stop)) 03064 == SRCH_TRANSLATE ('\n')) 03065 && rxb->newline_anchor) 03066 break; 03067 else 03068 goto test_do_return; 03069 } 03070 03071 case re_se_try: 03072 /* This is the first side effect in every 03073 * expression. 03074 * 03075 * FOR NO GOOD REASON...get rid of it... 03076 */ 03077 break; 03078 03079 case re_se_pushpos: 03080 { 03081 int urhere = 03082 ((int)(search_state.test_pos.pos 03083 - search_state.test_pos.string) 03084 + search_state.test_pos.offset); 03085 struct rx_counter_frame * old_cf 03086 = (search_state.counter_stack 03087 ? ((struct rx_counter_frame *) 03088 search_state.counter_stack->sp) 03089 : 0); 03090 struct rx_counter_frame * cf; 03091 PUSH(search_state.counter_stack, 03092 sizeof (struct rx_counter_frame)); 03093 cf = ((struct rx_counter_frame *) 03094 search_state.counter_stack->sp); 03095 cf->tag = re_se_pushpos; 03096 cf->val = urhere; 03097 cf->inherited_from = 0; 03098 cf->cdr = old_cf; 03099 break; 03100 } 03101 03102 case re_se_chkpos: 03103 { 03104 int urhere = 03105 ((int)(search_state.test_pos.pos 03106 - search_state.test_pos.string) 03107 + search_state.test_pos.offset); 03108 struct rx_counter_frame * cf 03109 = ((struct rx_counter_frame *) 03110 search_state.counter_stack->sp); 03111 if (cf->val == urhere) 03112 goto test_do_return; 03113 cf->val = urhere; 03114 break; 03115 } 03116 break; 03117 03118 case re_se_poppos: 03119 POP(search_state.counter_stack, 03120 sizeof (struct rx_counter_frame)); 03121 break; 03122 03123 03124 case re_se_at_dot: 03125 case re_se_syntax: 03126 case re_se_not_syntax: 03127 #ifdef emacs 03128 /* 03129 * this release lacks emacs support 03130 */ 03131 #endif 03132 break; 03133 case re_se_win: 03134 case re_se_lparen: 03135 case re_se_rparen: 03136 case re_se_backref: 03137 case re_se_iter: 03138 case re_se_end_iter: 03139 case re_se_tv: 03140 case re_floogle_flap: 03141 search_state.ret_val = 0; 03142 goto test_do_return; 03143 } 03144 } 03145 else 03146 { 03147 #ifdef RX_DEBUG_0 03148 if (rx_debug_trace) 03149 fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no, 03150 search_state.backtrack_depth, 03151 efnames2[rxb->se_params [effect].se], 03152 rxb->se_params [effect].op1, 03153 rxb->se_params [effect].op2); 03154 #endif 03155 switch (rxb->se_params [effect].se) 03156 { 03157 case re_se_win: 03158 /* This side effect indicates that we've 03159 * found a match, though not necessarily the 03160 * best match. This is a fancy assignment to 03161 * register 0 unless the caller didn't 03162 * care about registers. In which case, 03163 * this stops the match. 03164 */ 03165 { 03166 int urhere = 03167 ((int)(search_state.test_pos.pos 03168 - search_state.test_pos.string) 03169 + search_state.test_pos.offset); 03170 03171 if ( (search_state.best_last_r < 0) 03172 || (urhere + 1 > search_state.best_rparen[0])) 03173 { 03174 /* Record the best known and keep 03175 * looking. 03176 */ 03177 int x; 03178 for (x = 0; x <= search_state.last_l; ++x) 03179 search_state.best_lparen[x] = search_state.lparen[x]; 03180 search_state.best_last_l = search_state.last_l; 03181 for (x = 0; x <= search_state.last_r; ++x) 03182 search_state.best_rparen[x] = search_state.rparen[x]; 03183 search_state.best_rparen[0] = urhere + 1; 03184 search_state.best_last_r = search_state.last_r; 03185 } 03186 /* If we're not reporting the match-length 03187 * or other register info, we need look no 03188 * further. 03189 */ 03190 if (search_state.first_found) 03191 { 03192 search_state.test_ret = rx_test_found_first; 03193 goto test_do_return; 03194 } 03195 } 03196 break; 03197 case re_se_lparen: 03198 { 03199 int urhere = 03200 ((int)(search_state.test_pos.pos 03201 - search_state.test_pos.string) 03202 + search_state.test_pos.offset); 03203 03204 int reg = rxb->se_params [effect].op1; 03205 #if 0 03206 if (reg > search_state.last_l) 03207 #endif 03208 { 03209 search_state.lparen[reg] = urhere + 1; 03210 /* In addition to making this assignment, 03211 * we now know that lower numbered regs 03212 * that haven't already been assigned, 03213 * won't be. We make sure they're 03214 * filled with -1, so they can be 03215 * recognized as unassigned. 03216 */ 03217 if (search_state.last_l < reg) 03218 while (++search_state.last_l < reg) 03219 search_state.lparen[search_state.last_l] = -1; 03220 } 03221 break; 03222 } 03223 03224 case re_se_rparen: 03225 { 03226 int urhere = 03227 ((int)(search_state.test_pos.pos 03228 - search_state.test_pos.string) 03229 + search_state.test_pos.offset); 03230 int reg = rxb->se_params [effect].op1; 03231 search_state.rparen[reg] = urhere + 1; 03232 if (search_state.last_r < reg) 03233 { 03234 while (++search_state.last_r < reg) 03235 search_state.rparen[search_state.last_r] 03236 = -1; 03237 } 03238 break; 03239 } 03240 03241 case re_se_backref: 03242 { 03243 int reg = rxb->se_params [effect].op1; 03244 if ( reg > search_state.last_r 03245 || search_state.rparen[reg] < 0) 03246 goto test_do_return; 03247 03248 { 03249 int backref_status; 03250 check_backreference: 03251 backref_status 03252 = back_check (&search_state.test_pos, 03253 search_state.lparen[reg], 03254 search_state.rparen[reg], 03255 search_state.translate, 03256 app_closure, 03257 stop); 03258 switch (backref_status) 03259 { 03260 case rx_back_check_continuation: 03261 search_state.saved_reg = reg; 03262 test_pc = rx_test_backreference_check; 03263 goto test_return_continuation; 03264 resume_continuation_2: 03265 reg = search_state.saved_reg; 03266 goto check_backreference; 03267 case rx_back_check_fail: 03268 /* Fail */ 03269 goto test_do_return; 03270 case rx_back_check_pass: 03271 /* pass -- 03272 * test_pos now advanced to last 03273 * char matched by backref 03274 */ 03275 break; 03276 } 03277 } 03278 break; 03279 } 03280 case re_se_iter: 03281 { 03282 struct rx_counter_frame * csp 03283 = ((struct rx_counter_frame *) 03284 search_state.counter_stack->sp); 03285 if (csp->val == rxb->se_params[effect].op2) 03286 goto test_do_return; 03287 else 03288 ++csp->val; 03289 break; 03290 } 03291 case re_se_end_iter: 03292 { 03293 struct rx_counter_frame * csp 03294 = ((struct rx_counter_frame *) 03295 search_state.counter_stack->sp); 03296 if (csp->val < rxb->se_params[effect].op1) 03297 goto test_do_return; 03298 else 03299 { 03300 struct rx_counter_frame * source = csp; 03301 while (source->inherited_from) 03302 source = source->inherited_from; 03303 if (!source || !source->cdr) 03304 { 03305 POP(search_state.counter_stack, 03306 sizeof(struct rx_counter_frame)); 03307 } 03308 else 03309 { 03310 source = source->cdr; 03311 csp->val = source->val; 03312 csp->tag = source->tag; 03313 csp->cdr = 0; 03314 csp->inherited_from = source; 03315 } 03316 } 03317 break; 03318 } 03319 case re_se_tv: 03320 /* is a noop */ 03321 break; 03322 case re_se_try: 03323 case re_se_pushback: 03324 case re_se_push0: 03325 case re_se_pushpos: 03326 case re_se_chkpos: 03327 case re_se_poppos: 03328 case re_se_at_dot: 03329 case re_se_syntax: 03330 case re_se_not_syntax: 03331 case re_se_begbuf: 03332 case re_se_hat: 03333 case re_se_wordbeg: 03334 case re_se_wordbound: 03335 case re_se_notwordbound: 03336 case re_se_wordend: 03337 case re_se_endbuf: 03338 case re_se_dollar: 03339 case re_se_fail: 03340 case re_floogle_flap: 03341 search_state.ret_val = 0; 03342 goto test_do_return; 03343 } 03344 } 03345 el = el->cdr; 03346 } 03347 /* Now the side effects are done, 03348 * so get the next instruction. 03349 * and move on. 03350 */ 03351 search_state.ifr = &df->future_frame; 03352 goto restart; 03353 } 03354 03355 case rx_backtrack_point: 03356 { 03357 /* A backtrack point indicates that we've reached a 03358 * non-determinism in the superstate NFA. This is a 03359 * loop that exhaustively searches the possibilities. 03360 * 03361 * A backtracking strategy is used. We keep track of what 03362 * registers are valid so we can erase side effects. 03363 * 03364 * First, make sure there is some stack space to hold 03365 * our state. 03366 */ 03367 03368 struct rx_backtrack_frame * bf; 03369 03370 PUSH(search_state.backtrack_stack, 03371 search_state.backtrack_frame_bytes); 03372 #ifdef RX_DEBUG_0 03373 ++search_state.backtrack_depth; 03374 #endif 03375 03376 bf = ((struct rx_backtrack_frame *) 03377 search_state.backtrack_stack->sp); 03378 { 03379 bf->stk_super = search_state.super; 03380 /* We prevent the current superstate from being 03381 * deleted from the superstate cache. 03382 */ 03383 rx_lock_superstate (&rxb->rx, search_state.super); 03384 #ifdef RX_DEBUG_0 03385 bf->stk_search_state.line_no = search_state.line_no; 03386 #endif 03387 bf->stk_c = search_state.c; 03388 bf->stk_test_pos = search_state.test_pos; 03389 bf->stk_last_l = search_state.last_l; 03390 bf->stk_last_r = search_state.last_r; 03391 bf->df = ((struct rx_super_edge *) 03392 search_state.ifr->data_2)->options; 03393 bf->first_df = bf->df; 03394 bf->counter_stack_sp = (search_state.counter_stack 03395 ? search_state.counter_stack->sp 03396 : 0); 03397 bf->stk_test_ret = search_state.test_ret; 03398 if (rxb->match_regs_on_stack) 03399 { 03400 int x; 03401 regoff_t * stk = 03402 (regoff_t *)((char *)bf + sizeof (*bf)); 03403 for (x = 0; x <= search_state.last_l; ++x) 03404 stk[x] = search_state.lparen[x]; 03405 stk += x; 03406 for (x = 0; x <= search_state.last_r; ++x) 03407 stk[x] = search_state.rparen[x]; 03408 } 03409 } 03410 03411 /* Here is a while loop whose body is mainly a function 03412 * call and some code to handle a return from that 03413 * function. 03414 * 03415 * From here on for the rest of `case backtrack_point' it 03416 * is unsafe to assume that the search_state copies of 03417 * variables saved on the backtracking stack are valid 03418 * -- so read their values from the backtracking stack. 03419 * 03420 * This lets us use one generation fewer stack saves in 03421 * the call-graph of a search. 03422 */ 03423 03424 while_non_det_options: 03425 #ifdef RX_DEBUG_0 03426 ++search_state.lines_found; 03427 if (rx_debug_trace) 03428 fprintf (stderr, "@@@ %d calls %d @@@\n", 03429 search_state.line_no, search_state.lines_found); 03430 03431 search_state.line_no = search_state.lines_found; 03432 #endif 03433 03434 if (bf->df->next_same_super_edge[0] == bf->first_df) 03435 { 03436 /* This is a tail-call optimization -- we don't recurse 03437 * for the last of the possible futures. 03438 */ 03439 search_state.ifr = (bf->df->effects 03440 ? &bf->df->side_effects_frame 03441 : &bf->df->future_frame); 03442 03443 rx_unlock_superstate (&rxb->rx, search_state.super); 03444 POP(search_state.backtrack_stack, 03445 search_state.backtrack_frame_bytes); 03446 #ifdef RX_DEBUG 03447 --search_state.backtrack_depth; 03448 #endif 03449 goto restart; 03450 } 03451 else 03452 { 03453 if (search_state.counter_stack) 03454 { 03455 struct rx_counter_frame * old_cf 03456 = ((struct rx_counter_frame *)search_state.counter_stack->sp); 03457 struct rx_counter_frame * cf; 03458 PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame)); 03459 cf = ((struct rx_counter_frame *)search_state.counter_stack->sp); 03460 cf->tag = old_cf->tag; 03461 cf->val = old_cf->val; 03462 cf->inherited_from = old_cf; 03463 cf->cdr = 0; 03464 } 03465 /* `Call' this test-match block */ 03466 search_state.ifr = (bf->df->effects 03467 ? &bf->df->side_effects_frame 03468 : &bf->df->future_frame); 03469 goto recurse_test_match; 03470 } 03471 03472 /* Returns in this block are accomplished by 03473 * goto test_do_return. There are two cases. 03474 * If there is some search-stack left, 03475 * then it is a return from a `recursive' call. 03476 * If there is no search-stack left, then 03477 * we should return to the fastmap/search loop. 03478 */ 03479 03480 test_do_return: 03481 03482 if (!search_state.backtrack_stack) 03483 { 03484 #ifdef RX_DEBUG_0 03485 if (rx_debug_trace) 03486 fprintf (stderr, "!!! %d bails returning %d !!!\n", 03487 search_state.line_no, search_state.test_ret); 03488 #endif 03489 03490 /* No more search-stack -- this test is done. */ 03491 if (search_state.test_ret != rx_test_internal_error) 03492 goto return_from_test_match; 03493 else 03494 goto error_in_testing_match; 03495 } 03496 03497 /* Returning from a recursive call to 03498 * the test match block: 03499 */ 03500 03501 bf = ((struct rx_backtrack_frame *) 03502 search_state.backtrack_stack->sp); 03503 #ifdef RX_DEBUG_0 03504 if (rx_debug_trace) 03505 fprintf (stderr, "+++ %d returns %d (to %d)+++\n", 03506 search_state.line_no, 03507 search_state.test_ret, 03508 bf->stk_search_state.line_no); 03509 #endif 03510 03511 while (search_state.counter_stack 03512 && (!bf->counter_stack_sp 03513 || (bf->counter_stack_sp 03514 != search_state.counter_stack->sp))) 03515 { 03516 POP(search_state.counter_stack, 03517 sizeof (struct rx_counter_frame)); 03518 } 03519 03520 if (search_state.test_ret == rx_test_internal_error) 03521 { 03522 POP (search_state.backtrack_stack, 03523 search_state.backtrack_frame_bytes); 03524 search_state.test_ret = rx_test_internal_error; 03525 goto test_do_return; 03526 } 03527 03528 /* If a non-longest match was found and that is good 03529 * enough, return immediately. 03530 */ 03531 if ( (search_state.test_ret == rx_test_found_first) 03532 && search_state.first_found) 03533 { 03534 rx_unlock_superstate (&rxb->rx, bf->stk_super); 03535 POP (search_state.backtrack_stack, 03536 search_state.backtrack_frame_bytes); 03537 goto test_do_return; 03538 } 03539 03540 search_state.test_ret = bf->stk_test_ret; 03541 search_state.last_l = bf->stk_last_l; 03542 search_state.last_r = bf->stk_last_r; 03543 bf->df = bf->df->next_same_super_edge[0]; 03544 search_state.super = bf->stk_super; 03545 search_state.c = bf->stk_c; 03546 #ifdef RX_DEBUG_0 03547 search_state.line_no = bf->stk_search_state.line_no; 03548 #endif 03549 03550 if (rxb->match_regs_on_stack) 03551 { 03552 int x; 03553 regoff_t * stk = 03554 (regoff_t *)((char *)bf + sizeof (*bf)); 03555 for (x = 0; x <= search_state.last_l; ++x) 03556 search_state.lparen[x] = stk[x]; 03557 stk += x; 03558 for (x = 0; x <= search_state.last_r; ++x) 03559 search_state.rparen[x] = stk[x]; 03560 } 03561 03562 { 03563 int x; 03564 try_burst_2: 03565 x = get_burst (&bf->stk_test_pos, app_closure, stop); 03566 switch (x) 03567 { 03568 case rx_get_burst_continuation: 03569 search_state.saved_bf = bf; 03570 test_pc = rx_test_backtrack_return; 03571 goto test_return_continuation; 03572 resume_continuation_3: 03573 bf = search_state.saved_bf; 03574 goto try_burst_2; 03575 case rx_get_burst_no_more: 03576 /* Since we've been here before, it is some kind of 03577 * error that we can't return. 03578 */ 03579 case rx_get_burst_error: 03580 search_state.test_ret = rx_test_internal_error; 03581 goto test_do_return; 03582 case rx_get_burst_ok: 03583 break; 03584 } 03585 } 03586 search_state.test_pos = bf->stk_test_pos; 03587 goto while_non_det_options; 03588 } 03589 03590 03591 case rx_cache_miss: 03592 /* Because the superstate NFA is lazily constructed, 03593 * and in fact may erode from underneath us, we sometimes 03594 * have to construct the next instruction from the hard way. 03595 * This invokes one step in the lazy-conversion. 03596 */ 03597 search_state.ifr = rx_handle_cache_miss (&rxb->rx, 03598 search_state.super, 03599 search_state.c, 03600 search_state.ifr->data_2); 03601 if (!search_state.ifr) 03602 { 03603 search_state.test_ret = rx_test_internal_error; 03604 goto test_do_return; 03605 } 03606 goto restart; 03607 03608 case rx_backtrack: 03609 /* RX_BACKTRACK means that we've reached the empty 03610 * superstate, indicating that match can't succeed 03611 * from this point. 03612 */ 03613 goto test_do_return; 03614 03615 case rx_next_char: 03616 case rx_error_inx: 03617 case rx_num_instructions: 03618 search_state.ret_val = 0; 03619 goto test_do_return; 03620 } 03621 goto pseudo_while_1; 03622 } 03623 03624 /* Healthy exits from the test-match loop do a 03625 * `goto return_from_test_match' On the other hand, 03626 * we might end up here. 03627 */ 03628 error_in_testing_match: 03629 test_state = rx_test_error; 03630 goto test_returns_to_search; 03631 03632 /***** fastmap/search loop body 03633 * considering the results testing for a match 03634 */ 03635 03636 return_from_test_match: 03637 03638 if (search_state.best_last_l >= 0) 03639 { 03640 if (regs && (regs->start != search_state.best_lparen)) 03641 { 03642 bcopy (search_state.best_lparen, regs->start, 03643 regs->num_regs * sizeof (int)); 03644 bcopy (search_state.best_rparen, regs->end, 03645 regs->num_regs * sizeof (int)); 03646 } 03647 if (regs && !rxb->no_sub) 03648 { 03649 int q; 03650 int bound = (regs->num_regs > search_state.num_regs 03651 ? regs->num_regs 03652 : search_state.num_regs); 03653 regoff_t * s = regs->start; 03654 regoff_t * e = regs->end; 03655 for (q = search_state.best_last_l + 1; q < bound; ++q) 03656 s[q] = e[q] = -1; 03657 } 03658 search_state.ret_val = search_state.best_lparen[0]; 03659 test_state = rx_test_ok; 03660 goto test_returns_to_search; 03661 } 03662 else 03663 { 03664 test_state = rx_test_fail; 03665 goto test_returns_to_search; 03666 } 03667 03668 test_return_continuation: 03669 search_state.test_match_resume_pt = test_pc; 03670 test_state = rx_test_continuation; 03671 goto test_returns_to_search; 03672 } 03673 } 03674 03675 03676 03677 #endif /* RX_WANT_RX_DEFS */ 03678 03679 03680 03681 #else /* RX_WANT_SE_DEFS */ 03682 /* Integers are used to represent side effects. 03683 * 03684 * Simple side effects are given negative integer names by these enums. 03685 * 03686 * Non-negative names are reserved for complex effects. 03687 * 03688 * Complex effects are those that take arguments. For example, 03689 * a register assignment associated with a group is complex because 03690 * it requires an argument to tell which group is being matched. 03691 * 03692 * The integer name of a complex effect is an index into rxb->se_params. 03693 */ 03694 03695 RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */ 03696 03697 RX_DEF_SE(0, re_se_pushback, = re_se_try - 1) 03698 RX_DEF_SE(0, re_se_push0, = re_se_pushback -1) 03699 RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1) 03700 RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1) 03701 RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1) 03702 03703 RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */ 03704 RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */ 03705 RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */ 03706 03707 RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */ 03708 RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */ 03709 03710 RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1) 03711 RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1) 03712 RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1) 03713 03714 RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1) 03715 RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1) 03716 03717 /* This fails except at the end of a line. 03718 * It deserves to go here since it is typicly one of the last steps 03719 * in a match. 03720 */ 03721 RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1) 03722 03723 /* Simple effects: */ 03724 RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1) 03725 03726 /* Complex effects. These are used in the 'se' field of 03727 * a struct re_se_params. Indexes into the se array 03728 * are stored as instructions on nfa edges. 03729 */ 03730 RX_DEF_CPLX_SE(1, re_se_win, = 0) 03731 RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1) 03732 RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1) 03733 RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1) 03734 RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1) 03735 RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1) 03736 RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1) 03737 03738 #endif 03739 03740 #endif

Generated on Sun Mar 6 13:35:49 2005 for Komssys by doxygen 1.3.8