```    2% DCTG grammar for stochastic regular expressions
3% Feb/99
4%
5% Grammar:  a | E:F  |  [a:E1(N1)+b:E2(N2)]  |  E*(Pr)  |  E+(Pr)
6%
7% epsilon e:  	 epsilon (not allowed in constructs; equiv. to [] generation)
8% action:  	 a
9% concatanation: E:F			Pr(E:F) = Pr(E)*Pr(F)
10% choice:	 [E1(N1)+b:E2(N2)]	Pr(Ei(Ni)) = Ni / N1+N2
11% Kleene star:   E*(P)			Pr(E*) = [epsilon (1-P), (E:E*)(P)]
12% Kleene plus:   E+(P)			Pr(E+) = Pr(E:(E*(P)))
13%
14% where: Ni = integers >= 0,  P is probability (0 <= P < 1)
15%
16% Also: (a) no directly nested iteration allowed (star, plus)
17%       (b) choice must have at least 2 terms
18%
19% Semantic rules:
20% construct(E): makes a Prolog structure equivalent of expression
21% generate(S,SL1,SL2,P): randomly interprets an expression, giving output
22%	string S (as list) with associated probability P.
23%	Note that repetition (star, plus) are executed according to
24%	probability, as well as for a max. generated string length.
25%	When length met, then no repetitions possible (max_string_length_P/1)
26%	Resulting string may unavoidably exceed this max length.
27% raw_generate(S,SL1,SL2): like generate, but no probability computed.
28% recognize(S,P): finds way of recognizing a string with the
29% 	expression, resulting in probability P. Will work with backtracking,
30%	so that all possibile derivations can be found (no probabilistic
31%	execution as with generate/2.
32
33
34expr ::= iter_expr^^A
35<:>
36(construct(E) ::- A^^construct(E)),
37(raw_generate(S, SL1, SL2) ::- A^^raw_generate(S, SL1, SL2)),
38(recognize(S, S2, PrSoFar, Pr) ::-
39	check_prob(PrSoFar),
40	A^^recognize(S, S2, PrSoFar, Pr)).
41
42expr ::= noniter_expr^^A
43<:>
44(construct(E) ::- A^^construct(E)),
45(raw_generate(S, SL1, SL2) ::- A^^raw_generate(S, SL1, SL2)),
46(recognize(S, S2, PrSoFar, Pr) ::-
47	check_prob(PrSoFar),
48	A^^recognize(S, S2, PrSoFar, Pr)).
49
50% -------------------------------------
51
52noniter_expr ::= [a]   % action a
53<:>
54construct(a),
55(raw_generate([a], SL1, SL2) ::- SL2 is SL1+1),
56(recognize([a|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
57
58noniter_expr ::= [b]   % action b
59<:>
60construct(b),
61(raw_generate([b], SL1, SL2) ::- SL2 is SL1+1),
62(recognize([b|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
63
64% - - - -
65
66noniter_expr ::= guardedexpr_a^^A1, intval^^B1, guardedexpr_b^^A2, intval^^B2
67<:>
68(construct([(E1,N1),(E2,N2)]) ::-
69	A1^^construct(E1),
70	B1^^construct(N1),
71	A2^^construct(E2),
72	B2^^construct(N2)),
73(raw_generate(S, SL1, SL2) ::-
74	B1^^construct(N1), B2^^construct(N2),
75	(raw_select_term([N1,N2], 1) ->
76		A1^^raw_generate(S, SL1, SL2)
77		;
78		A2^^raw_generate(S, SL1, SL2))),
79(recognize(S, S2, PrSoFar, Pr) ::-
80	B1^^construct(Val1), B2^^construct(Val2),
81	Pr2 is PrSoFar*(Val1/(Val1+Val2)),
82	check_prob(Pr2),
83	A1^^recognize(S, S2, Pr2, Pr)),
84(recognize(S, S2, PrSoFar, Pr) ::-
85	B1^^construct(Val1), B2^^construct(Val2),
86	Pr2 is PrSoFar*(Val2/(Val1+Val2)),
87	check_prob(Pr2),
88	A2^^recognize(S, S2, Pr2, Pr)).
89
90% - - - -
91
92noniter_expr ::= expr^^A, expr^^B  % concat
93<:>
94(construct((E:F)) ::- A^^construct(E), B^^construct(F)),
95(raw_generate(S, SL1, SL2) ::-
96	A^^raw_generate(S1, SL1, SL3),
97	B^^raw_generate(S2, SL3, SL2),
98	append(S1, S2, S)),
99(recognize(S, S2, PrSoFar, Pr) ::-
100	check_prob(PrSoFar),
101	A^^recognize(S, S3, PrSoFar, Pr1),
102	check_prob(Pr1),
103	B^^recognize(S3, S2, Pr1, Pr)).
104
105% -------------------------------------
106
107iter_expr ::= noniter_expr^^A, probval^^B    % star
108<:>
109(construct((E)*(P)) ::- A^^construct(E), B^^construct(P)),
110(raw_generate(S, SL1, SL2) ::-
111	B^^construct(P),
112	max_string_length_P(MaxL),
113	raw_gen_loop(A, P, MaxL, S, SL1, SL2)),
114(recognize(S, S2, PrSoFar, Pr) ::-
115	check_prob(PrSoFar),
116	B^^construct(Pr1),
117	recognize_loop(A, Pr1, S, S2, PrSoFar, Pr)).
118
119% - - - -
120
121iter_expr ::= noniter_expr^^A, probval^^B    % plus
122<:>
123(construct((E)+(P)) ::- A^^construct(E), B^^construct(P)),
124(raw_generate(S, SL1, SL2) ::-
125	A^^raw_generate(S1, SL1, SL3),
126	B^^construct(P),
127	max_string_length_P(MaxL),
128	raw_gen_loop(A, P, MaxL, S2, SL3, SL2),
129	append(S1, S2, S),
130	!),
131(recognize(S, S2, PrSoFar, Pr) ::-
132	check_prob(PrSoFar),
133	A^^recognize(S, S3, PrSoFar, Pr1),
134	\+ (S=S3), % new
135	check_prob(Pr1),
136	B^^construct(Pr2),
137	recognize_loop(A, Pr2, S3, S2, Pr1, Pr)).
138
139% -------------------------------------
140
141guardedexpr_a ::= [a]   % action a
142<:>
143construct(a),
144(raw_generate([a], SL1, SL2) ::- SL2 is SL1+1),
145(recognize([a|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
146
147guardedexpr_a ::= [a], expr^^A  % concat
148<:>
149(construct((a:E)) ::- A^^construct(E)),
150(raw_generate([a|S], SL1, SL2) ::-
151	A^^raw_generate(S, SL1, SL3),
152	SL2 is SL3+1),
153(recognize([a|S], S2, PrSoFar, Pr) ::-
154	check_prob(PrSoFar),
155	A^^recognize(S, S2, PrSoFar, Pr)).
156
157guardedexpr_b ::= [b]   % action b
158<:>
159construct(b),
160(raw_generate([b], SL1, SL2) ::- SL2 is SL1+1),
161(recognize([b|T], T, PrSoFar, PrSoFar) ::- check_prob(PrSoFar)).
162
163guardedexpr_b ::= [b], expr^^A  % concat
164<:>
165(construct((b:E)) ::- A^^construct(E)),
166(raw_generate([b|S], SL1, SL2) ::-
167	A^^raw_generate(S, SL1, SL3),
168	SL2 is SL3+1),
169(recognize([b|S], S2, PrSoFar, Pr) ::-
170	check_prob(PrSoFar),
171	A^^recognize(S, S2, PrSoFar, Pr)).
172
173
174% -------------------------------------
175
176intval ::= [N], { is_an_integer(N) }
177<:>
178construct(N).
179
180probval ::= [R], { is_a_probability(R)}
181<:>
182construct(R).
183
184% ------------------------------------
185% Prolog utilities...
186% ------------------------------------
187
188% is_an_integer(N):
189%	N - integer value
190% Succeeds if N is an integer. If N is variable, a random integer
191% in desired range is created.
192
193is_an_integer(N) :-
194	integer(N),
195	!.
196is_an_integer(N) :-
197	int_range(Low, High),
198	random(Low, High, N).
199
200int_range(0, 1000).
201
202% ------------------------------------
203% is_a_probability(R):
204%	R - real value 0.0 <= R < 1.0
205% Succeeds if R is a float,  0.0 <= R < 1.0.
206% If R is variable, a random probability in desired range is created.
207
208is_a_probability(R) :-
209	float(R),
210	!.
211is_a_probability(R) :-
212	random(T),
213	R is truncate(T*100)/100.
214
215% ------------------------------------
216% raw_select_term(L, K):
217%	L - list of probability weights
218%	K - kth term selected via prob. weighting (between 1 and length(L))
219%	Pr - calculated probability of selected term
220% Like select_term, but no probability computed.
221
222raw_select_term(L, K) :-
223	sumlist(L, SL, 0, Sum),
224	random(0, Sum, X),
225	select_kth_term(SL, X, 1, K, _),
226	!.
227
228
229% ------------------------------------
230% sumlist(A, B, L, S):
231%	A - list of weights
232%	B - summed list of weights (roulette wheel)
233%	L - sum so far
234%	S - final sum
235% Creates a summed list of prob weights, with final total S.
236
237sumlist([], [], Sum, Sum).
238sumlist([N|R], [NewSum|SumList2], LastSum, Sum) :-
239	NewSum is LastSum + N,
240	sumlist(R, SumList2, NewSum, Sum).
241
242% ------------------------------------
243% select_kth_term(W, Val, SoFar, K, Val)
244%	W - list of summed weights (roulette wheel)
245%	Val - random value in wheel to use
246%	SoFar - counter
247%	K - selected term according to Val on W
248%	Val - value of selected term
249
250select_kth_term([Val], _, K, K, Val) :- !.
251select_kth_term([Val|_], X, K, K, Val) :-
252	Val >= X,
253	!.
254select_kth_term([_|R], X, K, K2, Val) :-
255	K3 is K + 1,
256	select_kth_term(R, X, K3, K2, Val).
257
258% ------------------------------------
259% raw_gen_loop(Tree, Pr, MaxL, S, SL1, SL2):
260%	Tree - grammar tree to process
261%	Pr - probability of doing an iteration
262%	MaxL - max length of generated string for terminating looping
263%	S - final generated string
264%	SL1, SL2 - current length and final length of generated string
265% Like gen_loop, but no probabilities computed.
266
267raw_gen_loop(Tree, Pr, MaxL, S, SL1, SL2) :-
268	SL1 < MaxL,
269	maybe(Pr),
270	Tree^^raw_generate(S1, SL1, SL3),
271	raw_gen_loop(Tree, Pr, MaxL, S2, SL3, SL2),
272	append(S1, S2, S),
273	!.
274raw_gen_loop(Tree, Pr, _, [], SL, SL) :-
275	!.
276
277
278% ------------------------------------
279% recognize_loop(Tree, Pr, S, S2, FinalPr):
280%	Tree - grammar tree to process
281%	Pr - probability of doing an iteration
282%	S, S2 - string to recognize (before, after)
283%	FinalPr - final probability of execution
284% recognize_loop performs successive iterations of an iterative expression.
285% Attempts to recognize S, computing probability each time.
286% No limit to number of iterations, other than the ability to consume S.
287%	epsilon
288%	E
289%	E:E
290%	E:E:E  etc.
291% As soon as an iteration fails to consume (after backtracking as well),
292% then iteration quits.
293
294recognize_loop(_, Pr, [], [], PrSoFar, FinalPr) :- % new
295	!,
296	FinalPr is PrSoFar*(1.0 - Pr),
297	check_prob(FinalPr).
298recognize_loop(T, Pr, S, S, PrSoFar, FinalPr) :-
299	FinalPr is PrSoFar*(1.0 - Pr),
300	check_prob(FinalPr).
301recognize_loop(Tree, Pr, S, S2, PrSoFar, FinalPr) :-
302	Pr3 is PrSoFar*Pr,
303	check_prob(Pr3),
304	Tree^^recognize(S, S3, Pr3, Pr1),
305	\+(S=S3),
306	check_prob(Pr1),
307	recognize_loop(Tree, Pr, S3, S2, Pr1, FinalPr).
308
309check_prob(P) :-
310	min_grammar_prob_P(E),
311	P > E,
312	!.
313
314% ------------------------------------
315% for testing...
316
317sre(Type, Expr, String, SL) :-
318	repeat,
319	(Type = full ; Type = grow),
320	generate_tree(expr, Type, 12, _, Tree, _),
321	Tree^^construct(Expr),
322	Tree^^raw_generate(String, 0, SL),
323	nl,sre_pp(Expr),nl,
324	write('tree '),write(Tree),nl,
325	tree_depth(Tree, Depth),
326	write('Depth = '), write(Depth),nl.
327
328sre2(Type, Expr, Input) :-
329	repeat,
330	(Type = full ; Type = grow),
331	generate_tree(expr, Type, 12, _, Tree, _),
332	Tree^^construct(Expr),
333	nl, write(Type),
334	nl,sre_pp(Expr),nl,
335	bagof((Leftover,Pr),Tree^^recognize(Input, Leftover, 1.0, Pr), Rlist),
336	write('Recog list: '), nl, writelist(Rlist), nl.
337
338sre2c(Type, Expr, Input) :-
339	repeat,
340	(Type = full ; Type = grow),
341	generate_tree(expr, Type, 12, _, Tree, _),
342	Tree^^construct(Expr),
343	nl, write(Type),
344	nl,sre_pp(Expr),nl,
345	bagof(Pr,Tree^^recognize(Input, [],  1.0, Pr), Prlist),
346	write('Pr list: '), nl, writelist(Prlist), nl.
347
348sre2b(Input) :-
349	generate_tree(expr, grow, 8, _, Tree, _),
350	Tree^^construct(Expr),
351	Tree^^recognize(Input, Leftover, 1.0, Pr),
352	nl,sre_pp(Expr),nl,
353	write('Prob = '), write(Pr), nl,
354	write('Leftover = '), write(Leftover), nl.
355
356% ------------------------------------
357% sre pretty printer
358
359sre_pp((E*R)) :-
360	write('('),
361	sre_pp(E),
362	write(')*'),
363	write(R),
364	!.
365sre_pp((E+R)) :-
366	write('('),
367	sre_pp(E),
368	write(')+'),
369	write(R),
370	!.
371sre_pp((E:F)) :-
372	sre_pp(E),
373	write(':'),
374	sre_pp(F),
375	!.
376sre_pp([A|T]) :-
377	sre_pp_l([A|T]),
378	!.
379sre_pp((A,B)) :-
380	write('('),
381	sre_pp(A),
382	write(','),
383	write(B),
384	write(')'),
385	!.
386sre_pp(X) :-
387	write(X).
388
389sre_pp_l([A]) :-
390	sre_pp(A),
391	!.
392sre_pp_l([A|T]) :-
393	write('['),
394	sre_pp(A),
395	write('+'),
396	sre_pp_l(T),
397	write(']'),
398	!.
399
400% ------------------------------------
401% solution dump: writes soln expression to a file, for input later.
402% Grammatical expression is written in multiple lines, since the full
403% expression is often larger than Prolog's builtin "write" can handle.
404
405/*
406write_soln(Run, E) :-
407	set_file_name("soln", Run, File),
408	tell(File),
409	write('soln('),
410	write_term(E),
411	write(').'),
412	nl,
413	told,
414	tell(user),
415	!.
416
417write_term(node(X,List,Y)) :-
418	!,
419	write('node('),
420	write(X),
421	write(',['),
422	write_tlist(List),
423	write('],'),
424	write(Y),
425	write(')').
426write_term(X) :- write(X).
427
428write_tlist([]) :- !.
429write_tlist([X,Y|Z]) :-
430	!,
431	write_term(X),
432	write(','),
433	nl,
434	write_tlist([Y|Z]).
435write_tlist([X]) :-
436	write_term(X).
437*/
```