#PERL TEXT SEARCH ENGINE, BY JIM TURNER # #WHEN REQUIRED INTO A PERL SCRIPT, THIS SEARCH ENGINE PROVIDES TWO FUNCTIONS #TO ASSIST WITH TEXT SEARCHES. # #THE FIRST FUNCTION (FIXSRCHPAT) ACCEPTS AN ENGLISH SEARCH EXPRESSION AND #RETURNS AN EQUIVALENT PERL REGULAR EXPRESSION WHICH CAN PERFORM THE SEARCH #ON A SPECIFIC TEXT STRING. # #THE 2ND FUNCTION (QUOTEWORDS) IS CALLED BY THE FIXSRCHPAT TO BREAK UP THE #ENGLISH SEARCH EXPRESSION INTO TOKENS AND OPERATORS. IT IS ALSO USER-CALLABLE. # #AN ENGLISH SEARCH EXPRESSION TAKES THE FORM: # # expn ::= single_expn | ( expn ) # single_expn = pattern | single_expn expn boolop pattern # boolop ::= and | or | AND | OR | && | & | || | | # pattern ::= token | unop token # unop ::= not | NOT | ~ | ! # token ::= 'regexp'regops* | quotedexpn | words # regops ::= e | g | i | m | o | s | x # quotedexpn ::= "words" # words ::= word | words word # word ::= {any string of characters except boolops and unescaped special # characters. # # example: 'unix and ~"C" or (o\'tools & '\bcobol\b'io) || three blind mice' # #TO CALL FIXSRCHPAT, PASS IT 3 STRINGS: THE 1ST REPRESENTING A PERL #VARIABLE NAME TO CONTAIN THE TEXT TO BE SEARCHED, 2ND, THE SEARCH #EXPRESSION, AS DESCRIBED ABOVE, AND 3RD (OPTIONAL), 'i' IF CASE IS TO BE #IGNORED, IE: # # $perlregexp = &fixsrchpat('mytext','Unix and ~"C"','i'); # $mytext = 'This has both unix but not letter CEE, so it should match!'; # # if (eval "$perlregexp") { # print "--matched!\n"; # } else { # print "--did not match.\n"; # } # sub fixsrchpat #CONVERT USER'S SEARCH PATTERNS INTO PERL REGULAR EXPRESSION. { my ($ct) = shift; my ($text) = shift; my ($nocase) = shift; my ($ops) = shift; my ($acc,$openquote,$beforeq,$quotedtkn,$afterq,$rexpnops,@new,@new2); $nocase =~ tr/A-Z/a-z/; $_ = $text; s/\\\'|\'\'/\x7e/g; #PROTECT "", \", '', AND \'. s/\\\"|\"\"/\x7f/g; $text = ''; while (/([\'\"])(.*?)\1([egimosxEGIMOSX]*)/) #ONLY PAD ( AND ) OUTSIDE OF QUOTES. { $openquote = $1; #OPENING QUOTE (SINGLE OR DOUBLE) $beforeq = $`; #STUFF BEFORE OPENING QUOTE $quotedtkn = $2; #STUFF BETWEEN THE QUOTES (QUOTED TOKEN) $afterq = $'; #STUFF AFTER CLOSING QUOTE $rexpnops = $3; #REGULAR EXPN OPTIONS ('...'iog) $rexpnops = $nocase . $rexpnops unless ($openquote eq '"' or $rexpnops =~ 'i'); #FIX REG. EXPNS. $rexpnops =~ tr/A-Z/a-z/; $beforeq =~ s/\(/ \( /g; #PAD ( AND ) TO FORCE THEM INTO SEPARATE WORDS. $beforeq =~ s/\)/ \) /g; $beforeq =~ s/\&\&?/ and /g; #treat "&" or "&&" as " and ". $beforeq =~ s/\|\|?/ or /g; $beforeq =~ s/[\~\!]/ not /g; $text .= $beforeq . $openquote . $quotedtkn . $openquote . $rexpnops; $_ = $afterq; } s/\(/ \( /g; #FINISH REMAINDER OF STRING AFTER LAST QUOTE PAIR. s/\)/ \) /g; s/\&\&?/ and /g; #treat "&" or "&&" as " and ". s/\|\|?/ or /g; s/[\~\!]/ not /g; $text .= $_; @new = quotewords(" ", 1, $text); #BREAK UP INTO WORDS (TOKENS). foreach (@new) { s/\x7e/\'/g; #RESTORE PROTECTED SINGLE QUOTES HERE. if ($_ =~ /^(and|or|not|\(|\))$/i) #WORD IS AN OPERATOR. { $acc =~ s/ $//; push (@new2,$acc) if ($acc gt ' '); $acc = ''; push (@new2,$_); } else #WORD IS A SEARCH PATTERN (OPERAND). { $acc .= $_ . ' '; } } $acc =~ s/ $//; push (@new2,$acc) if ($acc gt ' '); #GET REM. OPERAND AFTER LAST OP. $acc = ''; foreach (@new2) #CONVERT OPERATORS AND REGEXPNS TO CORRECT PERL RENDITION. { if ($_ =~ /^and$/i) { $acc .= ' && '; } elsif ($_ =~ /^or$/i) { $acc .= ' || '; } elsif ($_ =~ /^not$/i) { $acc .= ' !'; } elsif ($_ eq '(') { $acc .= ' ( '; } elsif ($_ eq ')') { $acc .= ' ) '; } elsif (/^\'(.+)\'(.*)/) #FORMAT REG-EXPNS INTO PERL EXPS. ($CT IS A PERL VARIABLE). { $acc .= "(\$$ct =~ m%$1%$2)"; } else #FORMAT DOUBLE-QUOTED "WORDS" TO PERL REG-EXPN. { #print "-jsrchengn: cash=$_= acc=$acc=\n"; if ($ops =~ /w/) { s/\\\?/\x02/g; s/\\\*/\x03/g; s/([\+\.\-\(\)\[\]\{\}\|\\])/\\$1/g; s/\?/\./g; s/\*/\\S\*\?/g; s/\x03/\\\*/g; s/\x02/\\\?/g; } else { s/([\+\.\-\?\*\(\)\[\]\{\}\|\\])/\\$1/g; } s/\"/(\\b|\\s|\\W)/g; # s/\"/\\b/g; $acc .= "(\$$ct =~ m%$_%${nocase}o)"; } } $acc =~ s/\x7f/\\\"/g; #RESTORE PROTECTED DOUBLE QUOTES HERE. return ($acc); #PERL-FORMATTED SEARCH-EXPRESSION. } sub quotewords { #SPLIT UP USER'S SEARCH-EXPRESSION INTO "WORDS" (TOKENISE)! # THIS CODE WAS COPIED FROM THE PERL "TEXT" MODULE, (ParseWords.pm), # written by: Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 # (Thanks, Hal!) # MODIFIED BY JIM TURNER (6/97) TO ALLOW ESCAPED (REGULAR-EXPRESSION) # CHARACTERS TO BE INCLUDED IN WORDS AND TO COMPRESS MULTIPLE OCCURRANCES # OF THE DELIMITER CHARACTER TO BE COMPRESSED INTO A SINGLE DELIMITER # (NO EMPTY WORDS). # # The inner "for" loop builds up each word (or $field) one $snippet # at a time. A $snippet is a quoted string, a backslashed character, # or an unquoted string. We fall out of the "for" loop when we reach # the end of $_ or when we hit a delimiter. Falling out of the "for" # loop, we push the $field we've been building up onto the list of # @words we'll be returning, and then loop back and pull another word # off of $_. # # The first two cases inside the "for" loop deal with quoted strings. # The first case matches a double quoted string, removes it from $_, # and assigns the double quoted string to $snippet in the body of the # conditional. The second case handles single quoted strings. In # the third case we've found a quote at the current beginning of $_, # but it didn't match the quoted string regexps in the first two cases, # so it must be an unbalanced quote and we croak with an error (which can # be caught by eval()). # # The next case handles backslashed characters, and the next case is the # exit case on reaching the end of the string or finding a delimiter. # # Otherwise, we've found an unquoted thing and we pull of characters one # at a time until we reach something that could start another $snippet-- # a quote of some sort, a backslash, or the delimiter. This one character # at a time behavior was necessary if the delimiter was going to be a # regexp (love to hear it if you can figure out a better way). my ($delim, $keep, @lines) = @_; my (@words,$snippet,$field,$q,@quotes); $_ = join('', @lines); while ($_) { $field = ''; for (;;) { $snippet = ''; @quotes = ('\'','"'); if (s/^(["'`])(.+?)\1//) { $snippet = $2; $snippet = "$1$snippet$1" if ($keep); } elsif (/^["']/) { print "Error: Unmatched quote near ($_)!
\n"; return (); } elsif (s/^\\(.)//) { $snippet = $1; $snippet = "\\$snippet" if ($keep); } elsif (!$_ || s/^$delim+//) { #REMOVE "+" TO REMOVE DELIMITER-COMPRESSION. last; } else { while ($_ && !(/^$delim/)) { #ATTEMPT TO HANDLE TWO QUOTES IN A ROW. last if (/^['"]/ && ($snippet !~ /\\$/)); $snippet .= substr($_, 0, 1); substr($_, 0, 1) = ''; } } $field .= $snippet; } push(@words, $field); } @words; } 1;