2% DCTG grammar for stochastic regular expressions
    3% Feb/99
    4%
    5% Grammar:  a | E:F  |  [a:E1(N1)+b:E2(N2)]  |  E*(Pr)  |  E+(Pr)
    6%
    7% epsilon e:  	 epsilon (not allowed in constructs; equiv. to [] generation)
    8% action:  	 a 
    9% concatanation: E:F			Pr(E:F) = Pr(E)*Pr(F)
   10% choice:	 [E1(N1)+b:E2(N2)]	Pr(Ei(Ni)) = Ni / N1+N2
   11% Kleene star:   E*(P)			Pr(E*) = [epsilon (1-P), (E:E*)(P)] 
   12% Kleene plus:   E+(P)			Pr(E+) = Pr(E:(E*(P)))
   13%
   14% where: Ni = integers >= 0,  P is probability (0 <= P < 1)
   15%
   16% Also: (a) no directly nested iteration allowed (star, plus)
   17%       (b) choice must have at least 2 terms
   18%
   19% Semantic rules:
   20% construct(E): makes a Prolog structure equivalent of expression
   21% generate(S,SL1,SL2,P): randomly interprets an expression, giving output 
   22%	string S (as list) with associated probability P. 
   23%	Note that repetition (star, plus) are executed according to 
   24%	probability, as well as for a max. generated string length.
   25%	When length met, then no repetitions possible (max_string_length_P/1)
   26%	Resulting string may unavoidably exceed this max length.
   27% raw_generate(S,SL1,SL2): like generate, but no probability computed.
   28% recognize(S,P): finds way of recognizing a string with the
   29% 	expression, resulting in probability P. Will work with backtracking,
   30%	so that all possibile derivations can be found (no probabilistic
   31%	execution as with generate/2.
   32
   33
   34expr ::= iter_expr^^A
   35<:>
   36(construct(E) ::- A^^construct(E)),
   37(raw_generate(S, SL1, SL2) ::- A^^raw_generate(S, SL1, SL2)),
   38(recognize(S, S2, PrSoFar, Pr) ::- 
   39	check_prob(PrSoFar), 
   40	A^^recognize(S, S2, PrSoFar, Pr)).
   41
   42expr ::= noniter_expr^^A
   43<:>
   44(construct(E) ::- A^^construct(E)),
   45(raw_generate(S, SL1, SL2) ::- A^^raw_generate(S, SL1, SL2)),
   46(recognize(S, S2, PrSoFar, Pr) ::- 
   47	check_prob(PrSoFar), 
   48	A^^recognize(S, S2, PrSoFar, Pr)).
   49
   50% -------------------------------------
   51
   52noniter_expr ::= [a]   % action a
   53<:>
   54construct(a),
   55(raw_generate([a], SL1, SL2) ::- SL2 is SL1+1),
   56(recognize([a|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
   57
   58noniter_expr ::= [b]   % action b
   59<:>
   60construct(b),
   61(raw_generate([b], SL1, SL2) ::- SL2 is SL1+1),
   62(recognize([b|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
   63
   64% - - - -
   65
   66noniter_expr ::= guardedexpr_a^^A1, intval^^B1, guardedexpr_b^^A2, intval^^B2
   67<:>
   68(construct([(E1,N1),(E2,N2)]) ::-  
   69	A1^^construct(E1), 
   70	B1^^construct(N1),
   71	A2^^construct(E2), 
   72	B2^^construct(N2)),
   73(raw_generate(S, SL1, SL2) ::- 
   74	B1^^construct(N1), B2^^construct(N2),	
   75	(raw_select_term([N1,N2], 1) ->
   76		A1^^raw_generate(S, SL1, SL2)
   77		;
   78		A2^^raw_generate(S, SL1, SL2))),
   79(recognize(S, S2, PrSoFar, Pr) ::- 
   80	B1^^construct(Val1), B2^^construct(Val2),
   81	Pr2 is PrSoFar*(Val1/(Val1+Val2)),
   82	check_prob(Pr2),
   83	A1^^recognize(S, S2, Pr2, Pr)),
   84(recognize(S, S2, PrSoFar, Pr) ::- 
   85	B1^^construct(Val1), B2^^construct(Val2),
   86	Pr2 is PrSoFar*(Val2/(Val1+Val2)),
   87	check_prob(Pr2),
   88	A2^^recognize(S, S2, Pr2, Pr)).
   89
   90% - - - -
   91
   92noniter_expr ::= expr^^A, expr^^B  % concat
   93<:>
   94(construct((E:F)) ::- A^^construct(E), B^^construct(F)),
   95(raw_generate(S, SL1, SL2) ::- 
   96	A^^raw_generate(S1, SL1, SL3), 
   97	B^^raw_generate(S2, SL3, SL2), 
   98	append(S1, S2, S)),
   99(recognize(S, S2, PrSoFar, Pr) ::- 
  100	check_prob(PrSoFar),
  101	A^^recognize(S, S3, PrSoFar, Pr1), 
  102	check_prob(Pr1),
  103	B^^recognize(S3, S2, Pr1, Pr)).
  104
  105% -------------------------------------
  106
  107iter_expr ::= noniter_expr^^A, probval^^B    % star
  108<:>
  109(construct((E)*(P)) ::- A^^construct(E), B^^construct(P)),
  110(raw_generate(S, SL1, SL2) ::- 
  111	B^^construct(P),
  112	max_string_length_P(MaxL),
  113	raw_gen_loop(A, P, MaxL, S, SL1, SL2)),
  114(recognize(S, S2, PrSoFar, Pr) ::- 
  115	check_prob(PrSoFar),
  116	B^^construct(Pr1),
  117	recognize_loop(A, Pr1, S, S2, PrSoFar, Pr)).
  118
  119% - - - -
  120
  121iter_expr ::= noniter_expr^^A, probval^^B    % plus
  122<:>
  123(construct((E)+(P)) ::- A^^construct(E), B^^construct(P)),
  124(raw_generate(S, SL1, SL2) ::- 
  125	A^^raw_generate(S1, SL1, SL3),
  126	B^^construct(P),
  127	max_string_length_P(MaxL),
  128	raw_gen_loop(A, P, MaxL, S2, SL3, SL2),
  129	append(S1, S2, S),
  130	!),
  131(recognize(S, S2, PrSoFar, Pr) ::- 
  132	check_prob(PrSoFar),
  133	A^^recognize(S, S3, PrSoFar, Pr1),
  134	\+ (S=S3), % new
  135	check_prob(Pr1),
  136	B^^construct(Pr2),
  137	recognize_loop(A, Pr2, S3, S2, Pr1, Pr)).
  138
  139% -------------------------------------
  140
  141guardedexpr_a ::= [a]   % action a
  142<:>
  143construct(a),
  144(raw_generate([a], SL1, SL2) ::- SL2 is SL1+1),
  145(recognize([a|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
  146
  147guardedexpr_a ::= [a], expr^^A  % concat
  148<:>
  149(construct((a:E)) ::- A^^construct(E)),
  150(raw_generate([a|S], SL1, SL2) ::- 
  151	A^^raw_generate(S, SL1, SL3), 
  152	SL2 is SL3+1),
  153(recognize([a|S], S2, PrSoFar, Pr) ::- 
  154	check_prob(PrSoFar),
  155	A^^recognize(S, S2, PrSoFar, Pr)).
  156
  157guardedexpr_b ::= [b]   % action b
  158<:>
  159construct(b),
  160(raw_generate([b], SL1, SL2) ::- SL2 is SL1+1),
  161(recognize([b|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
  162
  163guardedexpr_b ::= [b], expr^^A  % concat
  164<:>
  165(construct((b:E)) ::- A^^construct(E)),
  166(raw_generate([b|S], SL1, SL2) ::- 
  167	A^^raw_generate(S, SL1, SL3), 
  168	SL2 is SL3+1),
  169(recognize([b|S], S2, PrSoFar, Pr) ::- 
  170	check_prob(PrSoFar),
  171	A^^recognize(S, S2, PrSoFar, Pr)).
  172
  173
  174% -------------------------------------
  175
  176intval ::= [N], { is_an_integer(N) }
  177<:>
  178construct(N).
  179
  180probval ::= [R], { is_a_probability(R)}
  181<:>
  182construct(R).
  183
  184% ------------------------------------
  185% Prolog utilities...
  186% ------------------------------------
  187
  188% is_an_integer(N):
  189%	N - integer value
  190% Succeeds if N is an integer. If N is variable, a random integer 
  191% in desired range is created.
  192
  193is_an_integer(N) :-
  194	integer(N),
  195	!.
  196is_an_integer(N) :-
  197	int_range(Low, High),
  198	random(Low, High, N).
  199
  200int_range(0, 1000).
  201
  202% ------------------------------------
  203% is_a_probability(R):
  204%	R - real value 0.0 <= R < 1.0
  205% Succeeds if R is a float,  0.0 <= R < 1.0. 
  206% If R is variable, a random probability in desired range is created.
  207
  208is_a_probability(R) :-
  209	float(R),
  210	!.
  211is_a_probability(R) :-
  212	random(T),
  213	R is truncate(T*100)/100.
  214
  215% ------------------------------------
  216% raw_select_term(L, K):
  217%	L - list of probability weights
  218%	K - kth term selected via prob. weighting (between 1 and length(L))
  219%	Pr - calculated probability of selected term
  220% Like select_term, but no probability computed.
  221
  222raw_select_term(L, K) :-
  223	sumlist(L, SL, 0, Sum),
  224	random(0, Sum, X),
  225	select_kth_term(SL, X, 1, K, _),
  226	!.
  227
  228
  229% ------------------------------------
  230% sumlist(A, B, L, S):
  231%	A - list of weights
  232%	B - summed list of weights (roulette wheel)
  233%	L - sum so far
  234%	S - final sum
  235% Creates a summed list of prob weights, with final total S.
  236
  237sumlist([], [], Sum, Sum).
  238sumlist([N|R], [NewSum|SumList2], LastSum, Sum) :-
  239	NewSum is LastSum + N,
  240	sumlist(R, SumList2, NewSum, Sum).
  241
  242% ------------------------------------
  243% select_kth_term(W, Val, SoFar, K, Val)
  244%	W - list of summed weights (roulette wheel)
  245%	Val - random value in wheel to use
  246%	SoFar - counter
  247%	K - selected term according to Val on W
  248%	Val - value of selected term
  249
  250select_kth_term([Val], _, K, K, Val) :- !.  
  251select_kth_term([Val|_], X, K, K, Val) :-
  252	Val >= X, 
  253	!.
  254select_kth_term([_|R], X, K, K2, Val) :-
  255	K3 is K + 1,
  256	select_kth_term(R, X, K3, K2, Val).
  257
  258% ------------------------------------
  259% raw_gen_loop(Tree, Pr, MaxL, S, SL1, SL2):
  260%	Tree - grammar tree to process
  261%	Pr - probability of doing an iteration
  262%	MaxL - max length of generated string for terminating looping
  263%	S - final generated string
  264%	SL1, SL2 - current length and final length of generated string
  265% Like gen_loop, but no probabilities computed.
  266
  267raw_gen_loop(Tree, Pr, MaxL, S, SL1, SL2) :-
  268	SL1 < MaxL,
  269	maybe(Pr),
  270	Tree^^raw_generate(S1, SL1, SL3),
  271	raw_gen_loop(Tree, Pr, MaxL, S2, SL3, SL2),
  272	append(S1, S2, S),
  273	!.
  274raw_gen_loop(Tree, Pr, _, [], SL, SL) :- 
  275	!.
  276
  277
  278% ------------------------------------
  279% recognize_loop(Tree, Pr, S, S2, FinalPr):
  280%	Tree - grammar tree to process
  281%	Pr - probability of doing an iteration
  282%	S, S2 - string to recognize (before, after)
  283%	FinalPr - final probability of execution
  284% recognize_loop performs successive iterations of an iterative expression.
  285% Attempts to recognize S, computing probability each time. 
  286% No limit to number of iterations, other than the ability to consume S.
  287%	epsilon
  288%	E
  289%	E:E
  290%	E:E:E  etc.
  291% As soon as an iteration fails to consume (after backtracking as well), 
  292% then iteration quits.
  293
  294recognize_loop(_, Pr, [], [], PrSoFar, FinalPr) :- % new
  295	!,
  296	FinalPr is PrSoFar*(1.0 - Pr),
  297	check_prob(FinalPr).
  298recognize_loop(T, Pr, S, S, PrSoFar, FinalPr) :-
  299	FinalPr is PrSoFar*(1.0 - Pr),
  300	check_prob(FinalPr).
  301recognize_loop(Tree, Pr, S, S2, PrSoFar, FinalPr) :-
  302	Pr3 is PrSoFar*Pr,
  303	check_prob(Pr3),
  304	Tree^^recognize(S, S3, Pr3, Pr1),
  305	\+(S=S3),
  306	check_prob(Pr1),
  307	recognize_loop(Tree, Pr, S3, S2, Pr1, FinalPr).
  308
  309check_prob(P) :-
  310	min_grammar_prob_P(E),
  311	P > E,
  312	!.
  313
  314% ------------------------------------
  315% for testing...
  316
  317sre(Type, Expr, String, SL) :-
  318	repeat,
  319	(Type = full ; Type = grow),
  320	generate_tree(expr, Type, 12, _, Tree, _),
  321	Tree^^construct(Expr),
  322	Tree^^raw_generate(String, 0, SL),
  323	nl,sre_pp(Expr),nl,
  324	write('tree '),write(Tree),nl,
  325	tree_depth(Tree, Depth),
  326	write('Depth = '), write(Depth),nl.
  327
  328sre2(Type, Expr, Input) :-
  329	repeat,
  330	(Type = full ; Type = grow),
  331	generate_tree(expr, Type, 12, _, Tree, _),
  332	Tree^^construct(Expr),
  333	nl, write(Type),
  334	nl,sre_pp(Expr),nl,
  335	bagof((Leftover,Pr),Tree^^recognize(Input, Leftover, 1.0, Pr), Rlist),
  336	write('Recog list: '), nl, writelist(Rlist), nl.
  337
  338sre2c(Type, Expr, Input) :-
  339	repeat,
  340	(Type = full ; Type = grow),
  341	generate_tree(expr, Type, 12, _, Tree, _),
  342	Tree^^construct(Expr),
  343	nl, write(Type),
  344	nl,sre_pp(Expr),nl,
  345	bagof(Pr,Tree^^recognize(Input, [],  1.0, Pr), Prlist),
  346	write('Pr list: '), nl, writelist(Prlist), nl.
  347
  348sre2b(Input) :-
  349	generate_tree(expr, grow, 8, _, Tree, _),
  350	Tree^^construct(Expr),
  351	Tree^^recognize(Input, Leftover, 1.0, Pr),
  352	nl,sre_pp(Expr),nl,
  353	write('Prob = '), write(Pr), nl,
  354	write('Leftover = '), write(Leftover), nl.
  355
  356% ------------------------------------
  357% sre pretty printer
  358
  359sre_pp((E*R)) :-
  360	write('('),
  361	sre_pp(E),
  362	write(')*'),
  363	write(R),
  364	!.
  365sre_pp((E+R)) :-
  366	write('('),
  367	sre_pp(E),
  368	write(')+'),
  369	write(R),
  370	!.
  371sre_pp((E:F)) :-
  372	sre_pp(E),
  373	write(':'),
  374	sre_pp(F),
  375	!.
  376sre_pp([A|T]) :-
  377	sre_pp_l([A|T]),
  378	!.
  379sre_pp((A,B)) :-
  380	write('('),
  381	sre_pp(A),
  382	write(','),
  383	write(B),
  384	write(')'),
  385	!.
  386sre_pp(X) :-
  387	write(X).
  388
  389sre_pp_l([A]) :-
  390	sre_pp(A),
  391	!.
  392sre_pp_l([A|T]) :-
  393	write('['),
  394	sre_pp(A),
  395	write('+'),
  396	sre_pp_l(T),
  397	write(']'),
  398	!.
  399
  400% ------------------------------------
  401% solution dump: writes soln expression to a file, for input later.
  402% Grammatical expression is written in multiple lines, since the full
  403% expression is often larger than Prolog's builtin "write" can handle.
  404
  405/*
  406write_soln(Run, E) :-
  407	set_file_name("soln", Run, File),
  408	tell(File),
  409	write('soln('),
  410	write_term(E),
  411	write(').'),
  412	nl,
  413	told,
  414	tell(user),
  415	!.
  416
  417write_term(node(X,List,Y)) :-
  418	!,
  419	write('node('),
  420	write(X),
  421	write(',['),
  422	write_tlist(List),
  423	write('],'),
  424	write(Y),
  425	write(')').
  426write_term(X) :- write(X).
  427
  428write_tlist([]) :- !.
  429write_tlist([X,Y|Z]) :-
  430	!,
  431	write_term(X),
  432	write(','),
  433	nl,
  434	write_tlist([Y|Z]).
  435write_tlist([X]) :-
  436	write_term(X).
  437*/