# (C) 2006 by Remo Dentato (rdentato@users.sourceforge.net) # # Permission to use, copy, modify and distribute this code and # its documentation for any purpose is hereby granted without fee, # provided that the above copyright notice, or equivalent attribution # acknowledgement, appears in all copies and supporting documentation. # # Copyright holder makes no representations about the suitability # of this software for any purpose. It is provided "as is" without # express or implied warranty. # * Introduction # This file is both a testsuite and part of the manual for yrx. # Every single matching operator is shown with examples. # Groups of patterns are defined with a number and the pattern enclosed # in between two slashes ('/'). # The strings that follow (enclosed in double quotes) are to be matched against # the group of patterns. The number following the string indicates which # pattern the string matches (0 if none). # * Characters # Usually characters in a pattern match themselves. 1/plain string/ "plain string" 1 {plain string} {} {} {} {} {} {} {} {} "other string" 0 {} {} {} {} {} {} {} {} {} # * Character class # A character set is a way to specify a set of characters at once.
1/[aeiou]/ # vowels 2/[bcdfghjklmnpqrstvwxyz]/ # consonants "u" 1 {u} {} {} {} {} {} {} {} {} "x" 2 {x} {} {} {} {} {} {} {} {} "3" 0 {} {} {} {} {} {} {} {} {} # A set may be specified with a range 1/[A-Ga-g]/ # Notes pitch "A" 1 {A} {} {} {} {} {} {} {} {} "b" 1 {b} {} {} {} {} {} {} {} {} "K" 0 {} {} {} {} {} {} {} {} {} # To specify characters that do not belong to a given set, the '^' symbol # is used: 1/[^A-Z]/ # any character that is not an uppercase letter "u" 1 {u} {} {} {} {} {} {} {} {} "X" 0 {} {} {} {} {} {} {} {} {} "3" 1 {3} {} {} {} {} {} {} {} {} # Also there are predefined classes: # \a alphabetic character # \b backspace character (ASCII 8) # \c a "control" character (ASCII 1 to ASCII 7) # \d decimal digit # \f form feed character (ASCII 12) # \h a hexadecimal digit ([0-9A-Fa-f]) # \l lowercase character # \n newline character (ASCII 10) # \oxxx the character whose code is xxx (in octal) # \p a punctuation character # \q alphanumeric (a letter or a digit) # \r carriage return character (ASCII 13) # \s a spacing character (includes newlines, tabs, etc.) # \t tab character (ASCII 9) # \u uppercase character # \v vertical tab character (ASCII 11) # \w a "word" character ([\q_]) # \xhh the character whose code is hh (in hex) # \y space or tab (ASCII 32 or ASCII 9) 1/\a\l\u\d\q\s\p\h\w\y/ "PaR1S :F_ " 1 {PaR1S :F_ } {} {} {} {} {} {} {} {} "1Ag" 0 {} {} {} {} {} {} {} {} {} # Note that the predefined sets are influenced by the current locale # settings. This means that "\l" may be different from "[a-z]", for example # the accented e "è" is considered a lowercase letter if locale is set to "It". # Predefined sets can be used within character set definitions. 1/[\d,]+/ 2/[^\a\d]+/ # Anything but a letter or a digit "543,21" 1 {543,21} {} {} {} {} {} {} {} {} "100.31" 1 {100} {} {} {} {} {} {} {} {} "abc@3" 2 {@} {} {} {} {} {} {} {} {} "xyzwt" 0 {} {} {} {} {} {} {} {} {} # * Recognizers # Some commonly used patterns have been predefined: # .
any character (except '\0'). Depending on the switch \E it # can also match an escaped character. # \Q quoted string. Equivalent to: # ("(\\.|[^"])*"|'(\\.|[^'])*') # \N decimal integer number. Equivalent to: # [+-]?\d+ # \H hex integer number. Equivalent to: # (0x)?\h+ # \F floating point number. Equivalent to: # [+-]?\d*(.\d*)?([eE][+-]?\d*(.\d*))? # but doesn't match the empty string # \L generalized newline. Equivalent to: # (\r\n?|\n) # \I identifier. Equivalent to: # [\a_]\w+ # \Bxy balanced braces. The characters x and y must be # different. For example \B() matches "(f(g(x)))". # \W white space. Equivalent to: # [ \t]* # # \A # \Z # * Grouping # Expressions can be grouped by parentheses. If "X" is an expression, "(X)" # is a pattern that defines the same set of strings as "X". 1/(a[bc])/ 2/ab/ # this will never match! 3/\l\l/ "ab" 1 {ab} {} {} {} {} {} {} {} {} "ac" 1 {ac} {} {} {} {} {} {} {} {} "bc" 3 {bc} {} {} {} {} {} {} {} {} "00" 0 {} {} {} {} {} {} {} {} {} # Within a group it is possible to define alternative patterns. # If "X" and "Y" are patterns, "(X|Y)" is a pattern that defines # the set of strings that match "X" or "Y". 1/(ab|ac)/ 2/\l\l/ "ab" 1 {ab} {} {} {} {} {} {} {} {} "ac" 1 {ac} {} {} {} {} {} {} {} {} "bc" 2 {bc} {} {} {} {} {} {} {} {} # * Operators # The following operators can be applied to a pattern X: # ^X matches X at the beginning of the input string # X$ matches X at the end of the input string # X* matches 0 or more repetitions of X # X+ matches 1 or more repetitions of X # X? matches X or the empty string # X<m,n> matches m to n repetitions of X # X<m> matches m repetitions of X # X<m,> matches m or more repetitions of X # X<,n> matches 0 to n repetitions of X # X!
matches the empty string if X doesn't match # X& matches the empty string if X matches # X# matches any string whose end matches X 1/^\l\l/ # two lowercase letters at the beginning of the string 2/\l\l$/ # two lowercase letters at the end of the string 3/\l\l/ # two lowercase letters anywhere in the string "xx<<" 1 {xx} {} {} {} {} {} {} {} {} ">>zy" 2 {zy} {} {} {} {} {} {} {} {} ">wy<" 3 {wy} {} {} {} {} {} {} {} {} "q11p" 0 {} {} {} {} {} {} {} {} {} "db" 1 {db} {} {} {} {} {} {} {} {} 1/.<2,3>/ "a" 0 {} {} {} {} {} {} {} {} {} "ab" 1 {ab} {} {} {} {} {} {} {} {} "abc" 1 {abc} {} {} {} {} {} {} {} {} "abcd" 1 {abc} {} {} {} {} {} {} {} {} 1/.<0,3>/ "a" 1 {a} {} {} {} {} {} {} {} {} "ab" 1 {ab} {} {} {} {} {} {} {} {} "abc" 1 {abc} {} {} {} {} {} {} {} {} "abcd" 1 {abc} {} {} {} {} {} {} {} {} 1/.<2,0>/ "a" 0 {} {} {} {} {} {} {} {} {} "ab" 1 {ab} {} {} {} {} {} {} {} {} "abc" 1 {abc} {} {} {} {} {} {} {} {} "abcd" 1 {abcd} {} {} {} {} {} {} {} {} 1/,\d!/ # a comma not followed by a digit 2/\a:&/ # a letter followed by a colon ",x" 1 {,} {} {} {} {} {} {} {} {} ",3" 0 {} {} {} {} {} {} {} {} {} ".3" 0 {} {} {} {} {} {} {} {} {} "x:" 2 {x} {} {} {} {} {} {} {} {} "9:" 0 {} {} {} {} {} {} {} {} {} "x." 0 {} {} {} {} {} {} {} {} {} 1/''#/ # string delimited by a single quote "''" 1 {''} {} {} {} {} {} {} {} {} "'aaa'" 1 {'aaa'} {} {} {} {} {} {} {} {} "'aaa'bbb'" 1 {'aaa'} {} {} {} {} {} {} {} {} # BEWARE! Since the YRX patterns are always greedy, you can't have # alternative patterns with one of the patterns matching the empty # string. 1/(a*|b)+/ "aaaa" 1 {aaaa} {} {} {} {} {} {} {} {} "aaaababab" 1 {aaaa} {} {} {} {} {} {} {} {} 1/(a*b?)+/ "aaaa" 1 {aaaa} {} {} {} {} {} {} {} {} "aaaababab" 1 {aaaababab} {} {} {} {} {} {} {} {} # * Special Characters # If a special character is to be matched literally, it must be # escaped with a backslash. Except if they are inside a character set.
1/\[.[|*#^$&!()?+{}<>\\]+\]/ "[*))((*]" 1 {[*))((*]} {} {} {} {} {} {} {} {} # * Captures # Parts of the matching string can be captured and returned among results. # There can be up to 8 captures. # { begin capture # } end capture # \1 reference 1st capture # \2 reference 2nd capture # ... # \8 reference 8th capture 1/{\a}\1/ # twice the same letter 2/{\a\a}\1/ # twice the same couple of letters "xaax" 1 {aa} {a} {} {} {} {} {} {} {} "xaxa" 2 {xaxa} {xa} {} {} {} {} {} {} {} "abcd" 0 {} {} {} {} {} {} {} {} {} 1/{.}\1#/ # a string delimited by its first character "'abcd'" 1 {'abcd'} {'} {} {} {} {} {} {} {} "" 0 {} {} {} {} {} {} {} {} {} "$$" 1 {$$} {$} {} {} {} {} {} {} {} 1/:{\l{\d+}}:/ # nested captures ":a123:" 1 {:a123:} {a123} {123} {} {} {} {} {} {} 1/a{({bc}|{bd})}/ "abc" 1 {abc} {bc} {bc} {} {} {} {} {} {} "abd" 1 {abd} {bd} {} {bd} {} {} {} {} {} "xbd" 0 {} {} {} {} {} {} {} {} {} "abe" 0 {} {} {} {} {} {} {} {} {} 1/\y*{\l+}\({[?\)]*}\)/ "xx"0 {} {} {} {} {} {} {} {} {} # * Switch # A switch turns on and off a specific feature of the pattern matching # engine. # \C case sensitive match on/off # \Ex turns on escaped match for the "." recognizer # \e turns off escaped match 1/abcd/ 2/ab\Ccd/ "abcd" 1 {abcd} {} {} {} {} {} {} {} {} "abCd" 2 {abCd} {} {} {} {} {} {} {} {} "ABcd" 0 {} {} {} {} {} {} {} {} {} 1/\E\\''#/ # Remember that "X#" is equivalent to "(X!.)*X" 2/\E`{.}\1#/ "'abcde'" 1 {'abcde'} {} {} {} {} {} {} {} {} "$xyz$" 2 {$xyz$} {$} {} {} {} {} {} {} {} "$xyz`$abc$" 2 {$xyz`$abc$} {$} {} {} {} {} {} {} {} "abcd" 0 {} {} {} {} {} {} {} {} {} 1/\E$'{[^']*}'\e{[^']*}'/ "'ab'cd'" 1 {'ab'cd'} {ab} {cd} {} {} {} {} {} {} "'ab$'cd'ef'" 1 {'ab$'cd'ef'} {ab$'cd} {ef} {} {} {} {} {} {} "'ab$'cd'ef$'gh'" 1 {'ab$'cd'ef$'} {ab$'cd} {ef$} {} {} {} {} {} {}