1:- module(pac_regex, [parse_regex/2,
    2		      regex_unix_macro/2]).    3
    4% [2015/02/17]
    5
    6regex_unix_macro($, @( $ )).
    7regex_unix_macro(^, []).
    8
    9% ?- parse_regex("a", X).
   10% ?- parse_regex("a|b", X).
   11% ?- parse_regex("(abc)*", X).
   12% ?- parse_regex("([% \t]|(%@*)", X).
   13% ?- parse_regex("([% \t]|(%@*)|(\\?-))*", X).
   14% ?- parse_regex("[^\n]*$", X).
   15
   16			/*********************
   17			*     parse regex    *
   18			*********************/
   19% %
   20% dcg_d((.), [A|B], B, [A]).
   21% dcg_d([A], [A|B], B, [A]).
   22% dcg_d({G},  A,    A, {G}).
   23% dcg_d(X, A, B, X0):-  pacx:complete_args(X, [A, B], X0).
   24
   25% ?- parse_regex("ab", R).
   26% ?- parse_regex("a\\*",R).
   27% ?- parse_regex("a\\\*",R).  % <== intentional Syntex error
   28% ?- parse_regex("a\\\\*",R).
   29% ?- parse_regex("a\\\\\\*",R).
   30% ?- parse_regex("[abc\\]]*",R).
   31% ?- parse_regex("[abc\\\\]]*",R).
   32% ?- parse_regex("a", R).
   33% ?- parse_regex("(a)", R).
   34% ?- parse_regex("(abc)", R).
   35% ?- parse_regex("[abc]", R).
   36% ?- parse_regex("[abc]*",R).
   37% ?- parse_regex("[^abc]", R).
   38% ?- parse_regex("(.*)", R).
   39% ?- parse_regex("(a*)", R).
   40% ?- parse_regex("a*b", R).
   41% ?- parse_regex(".", R).
   42% ?- parse_regex("[^a-zA-Z]",R).
   43% ?- parse_regex("a|b|c", R).
   44% ?- parse_regex("(a|b|c)**", R).
   45% ?- parse_regex("(\\(*[a]|1)",R).
   46% ?- parse_regex("abc",R).
   47% ?- parse_regex("[a]",R).
   48% ?- parse_regex("[ab]",R).
   49% ?- parse_regex("[a-b]",R).
   50% ?- parse_regex(".",R).
   51% ?- parse_regex("\\.",R).
   52% ?- parse_regex("a",R).
   53% ?- parse_regex("[^\n]",R).
   54% ?- parse_regex("$",R).
   57parse_regex(X, Y) :- string_chars(X, X0),
   58	once(parse_regex([], [], [], Y0, X0, [])),
   59	Y0 = [Y].
   60%
   61parse_regex(A, B, A0, B0) --> token(T), !,
   62	{ once(push_pop_stacks(T, A, B, A1, B1)) },
   63	parse_regex(A1, B1, A0, B0).
   64parse_regex(A, B, A0, B0) --> % end of the regex
   65	{ close_block(A, B, A0, B1),
   66	  fold_block_reversely(B1, B0) }.
   67
   68% standard push/pop actions from  operator-precedence grammars
   69push_pop_stacks(')', A, X, B, Y) :-  close_block(A, X, B, Z),	% block close
   70	fold_block_reversely(Z,  Y).
   71push_pop_stacks('(', A, X, ['('|A], ['('|X]).	% block open
   72push_pop_stacks({}(I), A, [T|X],  A, [T^I|X]).	% repeat spec
   73push_pop_stacks(F, A, X, A, Y) :-  unary_opr(F),
   74	apply_unary_opr(F, X, Y).
   75push_pop_stacks(F, A, X, [F|B], Y) :- binary_opr(F),
   76	once(sweep_higher_opr(F, A, X, B, Y)).
   77push_pop_stacks(T, A, X, A, [T|X]).
   78
   79%
   80token('C'([C])) --> [(\)], [C0], {char_code(C0, C)}.  % escape charcter
   81token(X) --> [ A ], { regex_unix_macro(A, X) }.
   82token(T) --> [T], {memberchk(T, ['(',')', '*', '+', '!', '|', '.', '?'])}.
   83token(out(D))	--> ['[', ^],  char_class(D0, []), {chars_interval(D0, D)}.
   84token(dot(D))	--> ['['], char_class(D0, []), {chars_interval(D0, D)}.
   85token({}(I))	--> ['{'], repeat_spec(I).
   86token('C'([C]))	--> [C0], {char_code(C0, C)}.
   87
   88%
   89sweep_higher_opr(_, [], X, [], X).
   90sweep_higher_opr(_, ['('|X], Y, ['('|X], Y).
   91sweep_higher_opr(F, [G|A], X, B, Y):- higher_priority(G, F),
   92	apply_binary_opr(G, X, Z),
   93	sweep_higher_opr(F, A, Z, B, Y).
   94sweep_higher_opr(F, A, X, B, Y):- push_pop_stacks(F, A, X, B, Y).
   95
   96%
   97close_block([], X, [], X).
   98close_block(['('|A], X, A, X).
   99close_block([F|A], X, B, Y):-
  100	apply_binary_opr(F, X, Z),
  101	close_block(A, Z, B, Y).
  102
  103% ?- fold_block_reversely([a, c, '(', b], X).
  104fold_block_reversely([X|Y], [Z|U]):- fold_block_reversely(X, Y, Z, U).
  105fold_block_reversely([], []).
  106
  107%
  108fold_block_reversely(X, [], X, []).
  109fold_block_reversely(X, ['('|Y], X, Y).
  110fold_block_reversely(X, [Y|Z], U, V):- fold_block_reversely(Y+X, Z, U, V).
  111
  112%
  113unary_opr(*).
  114unary_opr(?).
  115unary_opr(!).
  116unary_opr(+).
  117
  118%
  119binary_opr(&).
  120binary_opr('|').
  121
  122% only for binary operators
  123higher_priority(&, &).
  124higher_priority(&, '|').
  125higher_priority('|', '|').
  126
  127%
  128apply_unary_opr(*, [X|Y], [*(X)|Y]).
  129apply_unary_opr(+, [X|Y], [+(X)|Y]).
  130apply_unary_opr(?, [X|Y], [?(X)|Y]).
  131apply_unary_opr(!, [X|Y], [!(X)|Y]).
  132
  133%
  134apply_binary_opr('|', [X, Y|Z], ['|'(Y, X)|Z]).
  135apply_binary_opr('&', [X, Y|Z], ['&'(Y, X)|Z]).
  136
  137% ?- repeat_spec(X, ['1', '2', ',', '3', '4', '}'], Y).
  138%@ X = 12-34
  139% ?- repeat_spec(X, ['1', '2', ',','}'], Y).
  140%@ X = >=(12)
  141
  142repeat_spec(X) --> number_chars(J0), [','],   { J0\==[] },
  143	 number_chars(K0), ['}'],
  144	{ 	number_chars(J, J0),
  145		( K0 \== []
  146		->	number_chars(K, K0),
  147			X = J-K
  148		;	X = (>=(J))
  149		)
  150	}.
  151
  152%
  153number_chars([D|Ds]) --> [D], {char_type(D, digit)},
  154	number_chars(Ds).
  155number_chars([]) --> [].
  156
  157%
  158char_class([\(C)|Y], Z)	--> [\], [C], char_class(Y,Z).
  159char_class([A|X], Y)	--> [A, A], char_class(X, Y).  % "idempotent" law
  160char_class(X, X)		--> [']'].
  161char_class([C|Y], Z)	--> [C], char_class(Y,Z).
  162
  163%
  164chars_interval([],[]).
  165chars_interval([X, -, Y|R], [C - D|S]):- drop_escape(X, X0),
  166	drop_escape(Y, Y0),
  167	char_code(X0, C),
  168	char_code(Y0, D),
  169	chars_interval(R, S).
  170chars_interval([X|R], [C|S]):- drop_escape(X, X0),
  171	char_code(X0, C),
  172	chars_interval(R, S).
  173
  174%
  175drop_escape(\(X), X).
  176drop_escape(X, X)