:- ensure_loaded('$REGULUS/PrologLib/compatibility'). :- module(random_generate_from_pcfg, [test/1, parse_pcfg_file/1, random_generate/5 ]). :- use_module('$REGULUS/Prolog/regulus_utilities'). :- use_module('$REGULUS/PrologLib/utilities'). :- use_module(library(lists)). :- use_module(library(system)). 'SICSTUS4ONLY'( ( :- use_module(library(file_systems)) ) ). :- use_module(library(random)). %------------------------------------------------------------------------------------ test(parse_taxi) :- parse_pcfg_file('$TAXI_DRIVER/Generated/pcfg_trained/recogniser.grammar'). test(parse_medslt) :- parse_pcfg_file('$MED_SLT2/Eng/GeneratedFiles/pcfg_trained/recogniser.grammar'). test(generate_taxi) :- random_generate('$TAXI_DRIVER/Generated/pcfg_trained/recogniser.grammar', '.MAIN', 10000, [], '$TAXI_DRIVER/corpora/random_sents.txt'). test(generate_medslt) :- random_generate('$MED_SLT2/Eng/GeneratedFiles/pcfg_trained/recogniser.grammar', '.MAIN', 1000, [dont_expand='POLITENESS_PRE_NONE', dont_expand='UTTERANCE_ELLIPSIS_N'], '$MED_SLT2/corpora/headache_random_pcfg.txt'). %------------------------------------------------------------------------------------ random_generate(GrammarFile, StartSymbol, N, OtherArgs, OutputFile) :- set_random_generator_state_from_time, absolute_file_name(OutputFile, AbsOutputFile), parse_pcfg_file(GrammarFile), format('~N~n--- Start generating from ~w~n', [StartSymbol]), open(AbsOutputFile, write, S), random_generate_to_stream(StartSymbol, OtherArgs, S, 0-N), close(S), format('~N~n--- Finished, generated ~d examples~n', [N]), !. random_generate(GrammarFile, StartSymbol, N, OtherArgs, OutputFile) :- format2error('~N*** Error: bad call: ~w~n', [random_generate(GrammarFile, StartSymbol, N, OtherArgs, OutputFile)]), fail. %------------------------------------------------------------------------------------ random_generate_to_stream(_StartSymbol, _OtherArgs, _S, I-N) :- I >= N, !. random_generate_to_stream(StartSymbol, OtherArgs, S, I-N) :- random_generate(non_terminal(StartSymbol), OtherArgs, Words-[]), join_with_spaces(Words, Atom), format(S, '~N~w~n', [Atom]), I1 is I + 1, ( 0 is I1 mod 1000 -> format('~d ', [I1]), flush_output(user) ; true ), !, random_generate_to_stream(StartSymbol, OtherArgs, S, I1-N). %------------------------------------------------------------------------------------ random_generate(terminal(Word), _OtherArgs, WordsIn-WordsOut) :- WordsIn = [Word | WordsOut], !. random_generate(non_terminal(Symbol), OtherArgs, WordsIn-WordsOut) :- member(dont_expand=Symbol, OtherArgs), WordsIn = [Symbol | WordsOut], !. random_generate(non_terminal(Symbol), OtherArgs, WordsIn-WordsOut) :- pcfg_rule(Symbol, Rules), random(Index), choose_rule(0, Index, Rules, ChosenRuleBody), !, random_generate_body(ChosenRuleBody, OtherArgs, WordsIn-WordsOut). random_generate_body([], _OtherArgs, WordsIn-WordsIn) :- !. random_generate_body([F | R], OtherArgs, WordsIn-WordsOut) :- random_generate(F, OtherArgs, WordsIn-WordsNext), !, random_generate_body(R, OtherArgs, WordsNext-WordsOut). choose_rule(TotalIn, Index, [rule(Prob, Rule) | R], ChosenRuleBody) :- TotalNext is TotalIn + Prob, ( TotalNext >= Index -> ChosenRuleBody = Rule ; otherwise -> choose_rule(TotalNext, Index, R, ChosenRuleBody) ). %------------------------------------------------------------------------------------ :- dynamic pcfg_rule/2. parse_pcfg_file(GrammarFile) :- safe_absolute_file_name(GrammarFile, AbsGrammarFile), retractall(pcfg_rule(_, _)), open(AbsGrammarFile, read, S), format('~N~n--- Parsing PCFG grammar from ~w~n', [AbsGrammarFile]), parse_pcfg_file1(S, waiting_for_rule, 0-N), close(S), format('~N~n--- Parsed grammar, ~d non-terminals~n', [N]), !. parse_pcfg_file1(S, InState, InN-OutN) :- read_and_parse_line(S, ParsedLine), process_parsed_line(ParsedLine, InState-NextState, InN-NextN), ( NextN > InN, 0 is NextN mod 100 -> format('~d ', [NextN]), flush_output(user) ; true ), !, parse_pcfg_file2(ParsedLine, S, NextState, NextN-OutN). parse_pcfg_file2(end_of_file, _S, _InState, InN-InN) :- !. parse_pcfg_file2(_ParsedLine, S, InState, InN-OutN) :- parse_pcfg_file1(S, InState, InN-OutN). read_and_parse_line(S, ParsedLine) :- read_line(S, Line), parse_line(Line, ParsedLine). parse_line(end_of_file, end_of_file) :- !. parse_line(Line, ParsedLine) :- line(ParsedLine, Line, []), !. parse_line(Line, _ParsedLine) :- format2error('~N*** Error: unable to parse line "~s"~n', [Line]), fail. /* Typical rule: NP_N_N_SUBJ_Y_NULL_NULL_NORMAL_N_N_N_N_N_N_N_N_N_N_N_N_N_N_DEGREE_SYMPTOM_NONE_3_SING [ ( the N_ANY_N_N_N_N_N_N_N_N_N_N_N_N_N_DEF_DEGREE_ANY_ANY_SYMPTOM_NONE_3_SING:v_0 )~0.766666666667 {return( $v_0 )} ( POSSESSIVE:v_0 N_ANY_N_N_N_N_N_N_N_N_N_N_N_N_N_DEF_DEGREE_ANY_ANY_SYMPTOM_NONE_3_SING:v_1 )~0.116666666667 {return( concat( $v_0 $v_1 ) )} ( D_N_ANY_ANY_N_DEF_3_SING:v_0 N_ANY_N_N_N_N_N_N_N_N_N_N_N_N_N_DEF_DEGREE_ANY_ANY_SYMPTOM_NONE_3_SING:v_1 )~0.116666666667 {return( concat( ( ( spec $v_0 ) ) $v_1 ) )} ] */ %------------------------------------------------------------------------------------ % process_parsed_line(ParsedLine, InState-OutState, InN-OutN) process_parsed_line(empty, InState-InState, InN-InN) :- !. process_parsed_line(non_terminal(NonTerminal), waiting_for_rule-started_rule(NonTerminal), InN-InN) :- !. process_parsed_line(left_square_bracket, started_rule(NonTerminal)-filling_rule(NonTerminal, []), InN-InN) :- !. process_parsed_line(right_square_bracket, filling_rule(NonTerminal, Rules)-waiting_for_rule, InN-OutN) :- reverse(Rules, ReversedRules), assertz(pcfg_rule(NonTerminal, ReversedRules)), OutN is InN + 1, !. process_parsed_line(semantic_return_value, InState-InState, InN-InN) :- !. process_parsed_line(rule(Prob, Body), filling_rule(NonTerminal, InRules)-filling_rule(NonTerminal, [rule(Prob, Body) | InRules]), InN-InN) :- !. process_parsed_line(end_of_file, InState-InState, InN-InN) :- !. process_parsed_line(ParsedLine, State, N) :- format2error('~N*** Error: bad call: ~w~n', [process_parsed_line(ParsedLine, State, N)]), fail. %------------------------------------------------------------------------------------ line(empty) --> whitespace_or_empty. line(non_terminal(NonTerminal)) --> non_terminal(NonTerminal). line(left_square_bracket) --> "[", whitespace_or_empty. line(right_square_bracket) --> "]", whitespace_or_empty. line(semantic_return_value) --> whitespace_or_empty, "{", anything. line(rule(Prob, Body)) --> "(", whitespace_or_empty, body(Body), whitespace_or_empty, ")~", probability_value(Prob). body([F | R]) --> body_element(F), whitespace_or_empty, !, body(R). body([]) --> whitespace_or_empty. body_element(non_terminal(NonTerminal)) --> non_terminal(NonTerminal), optional_associated_var, !. body_element(terminal(Terminal)) --> terminal(Terminal). probability_value(DecimalNumber) --> "0.", digit_sequence(DigitsAfterPoint), {safe_number_codes(DecimalNumber, [0'0, 0'. | DigitsAfterPoint])}, !. probability_value(1) --> "1", !. non_terminal(NonTerminal) --> ".", nuance_uppercase_char(F), non_special_char_sequence(R), !, {atom_codes(NonTerminal, [0'., F | R])}. non_terminal(NonTerminal) --> nuance_uppercase_char(F), non_special_char_sequence(R), !, {atom_codes(NonTerminal, [F | R])}. terminal(Terminal) --> nuance_lowercase_char(F), non_special_char_sequence(R), !, {atom_codes(Terminal, [F | R])}. optional_associated_var --> ":", non_special_char_sequence(_), whitespace_or_empty, !. optional_associated_var --> whitespace_or_empty. non_special_char_sequence([F | R]) --> non_special_char(F), !, non_special_char_sequence(R). non_special_char_sequence([]) --> []. digit_sequence([F | R]) --> digit_char(F), !, digit_sequence(R). digit_sequence([]) --> []. whitespace_or_empty --> whitespace_char(_), !, whitespace_or_empty. whitespace_or_empty --> []. anything --> [_Any], !, anything. anything --> []. whitespace_char(Char) --> [Char], {whitespace_char(Char)}. uppercase_char(Char) --> [Char], {uppercase_char(Char)}. nuance_uppercase_char(Char) --> [Char], {nuance_uppercase_char(Char)}. nuance_lowercase_char(0'\') --> [0'\']. nuance_lowercase_char(Char) --> [Char], %{lowercase_char(Char)}. {nuance_lowercase_char(Char)}. nuance_uppercase_char(Char) :- uppercase_char(Char), \+ accented_char(Char). digit_char(Char) --> [Char], {digit_char(Char)}. non_special_char(Char) --> [Char], {non_special_char(Char)}. nuance_lowercase_char(Char) :- \+ uppercase_char(Char), \+ special_char(Char). nuance_lowercase_char(Char) :- accented_char(Char), \+ special_char(Char). non_uppercase_non_special_char(Char) :- \+ uppercase_char(Char), \+ special_char(Char). non_special_char(Char) :- \+ special_char(Char). special_char(0' ). special_char(0'\n). special_char(0'\t). special_char(0':). special_char(0'~). special_char(0'(). special_char(0')). special_char(0'[). special_char(0']). special_char(0'{). special_char(0'}). special_char(0'$). special_char(0'~).