:- ensure_loaded('$REGULUS/PrologLib/compatibility').

%---------------------------------------------------------------

:- module(extract_interesting_examples,
          [extract_interesting_examples/5,
           extract_interesting_examples/6,
           split_ngrams_example_file/6,
           split_ngrams_example_file/4,

           test_extract_interesting_examples/1,
           test_split_ngrams_example_file/1
          ]
         ).

%---------------------------------------------------------------

:- use_module('$REGULUS/PrologLib/CorpusTools/tokenize_sents').
:- use_module('$REGULUS/PrologLib/CorpusTools/utils').
:- use_module('$REGULUS/PrologLib/utilities').

:- use_module(library(random)).
:- use_module(library(lists)).

%---------------------------------------------------------------

% test_extract_interesting_examples(+TestId)
%
% Canned invocations of extract_interesting_examples/5 and /6 on fixed
% corpus files. Each clause names one configuration: the tokenised
% input, the ngram comparison file, the cutoff, the maximum number of
% examples per ngram, optionally the output format, and the output file.

test_extract_interesting_examples(forum_tm_fr_plus_europarl) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples.txt',
    extract_interesting_examples(Tokenised, Comparison, 500, 5, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams.txt',
    extract_interesting_examples(Tokenised, Comparison, 20, 3, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams.csv',
    extract_interesting_examples(Tokenised, Comparison, 20, 3, csv, Out).
% Canned runs of extract_interesting_examples/6 for the "clitics" and
% "plus_or_quoi" bigram comparison files, in csv and pl output formats.

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_clitics_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_clitics.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_clitics.csv',
    extract_interesting_examples(Tokenised, Comparison, 10, 5, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_clitics_pl) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_clitics.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_clitics.pl',
    extract_interesting_examples(Tokenised, Comparison, 5, 10000, pl, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_plus_or_quoi_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_plus_or_quoi.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_plus_or_quoi.csv',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 10000, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_plus_or_quoi_pl) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_plus_or_quoi.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_plus_or_quoi.pl',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 10000, pl, Out).
% Canned csv runs of extract_interesting_examples/6 for the
% "plus_or_quoi" trigrams and the "jamais" / "on" comparison files.
% All of them use cutoff 0 and at most 5 examples per ngram.

test_extract_interesting_examples(forum_tm_fr_plus_europarl_trigrams_plus_or_quoi_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_plus_or_quoi.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_trigrams_plus_or_quoi.csv',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 5, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_jamais_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_jamais.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_jamais.csv',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 5, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_trigrams_jamais_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_jamais.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_trigrams_jamais.csv',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 5, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_bigrams_on_csv) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_on.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_on.csv',
    % Commented-out alternative cutoff kept from the original source: 10
    Cutoff     = 0,
    extract_interesting_examples(Tokenised, Comparison, Cutoff, 5, csv, Out).
% Canned runs of extract_interesting_examples/5 and /6 on the trigram
% comparison file.

test_extract_interesting_examples(forum_tm_fr_plus_europarl_trigrams) :-
    extract_interesting_examples('$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl',
                                 10,
                                 3,
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_trigrams.txt').

test_extract_interesting_examples(forum_tm_fr_plus_europarl_trigrams_csv) :-
    extract_interesting_examples('$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl',
                                 10,
                                 3,
                                 csv,
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_trigrams.csv').

% Here the third argument is a file name, not a numeric cutoff.
test_extract_interesting_examples(forum_tm_fr_plus_europarl_good_trigrams) :-
    extract_interesting_examples('$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl',
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/good_trigrams.pl',
                                 10,
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_good_trigrams.txt').

% This clause used to share the id forum_tm_fr_plus_europarl_trigrams_csv
% with the clause above, so it could only ever run on backtracking.
% It now has its own id; the arguments are unchanged (at most 2 examples
% per ngram instead of 3).
% NOTE(review): it still writes to the same csv file as
% forum_tm_fr_plus_europarl_trigrams_csv -- confirm whether that is intended.
test_extract_interesting_examples(forum_tm_fr_plus_europarl_trigrams_csv_max2) :-
    extract_interesting_examples('$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl',
                                 10,
                                 2,
                                 csv,
                                 '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_trigrams.csv').
% Remaining canned runs of extract_interesting_examples/5 and /6:
% the "good trigrams" list in csv format, and two reduced tokenised files.

% Here the third argument is a file name, not a numeric cutoff.
test_extract_interesting_examples(forum_tm_fr_plus_europarl_good_trigrams_csv) :-
    Tokenised    = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl',
    Comparison   = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl',
    GoodTrigrams = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/good_trigrams.pl',
    Out          = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_good_trigrams.csv',
    extract_interesting_examples(Tokenised, Comparison, GoodTrigrams, 10, csv, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_small) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised_small.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_small.txt',
    extract_interesting_examples(Tokenised, Comparison, 2500, 5, Out).

test_extract_interesting_examples(forum_tm_fr_plus_europarl_very_small) :-
    Tokenised  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised_very_small.pl',
    Comparison = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    Out        = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_very_small.txt',
    extract_interesting_examples(Tokenised, Comparison, 2500, 5, Out).

% test_split_ngrams_example_file(+TestId)
%
% Canned invocations of split_ngrams_example_file/4 on fixed files:
% the ngram example file, the number of dev examples per ngram,
% the dev output file and the test output file.

test_split_ngrams_example_file(bigrams_clitics) :-
    Examples = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_clitics.pl',
    Dev      = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_clitics_dev.pl',
    Test     = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_clitics_test.pl',
    split_ngrams_example_file(Examples, 3, Dev, Test).

test_split_ngrams_example_file(plus_or_quoi) :-
    Examples = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_examples_bigrams_plus_or_quoi.pl',
    Dev      = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_plus_or_quoi_dev.pl',
    Test     = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_plus_or_quoi_test.pl',
    split_ngrams_example_file(Examples, 3, Dev, Test).
% Canned run of split_ngrams_example_file/6. The extraction predicate
% has the form Record^Goal; here Goal accepts a record whose
% ngram_score is above 4.0 or whose freq2 is below 0.001.
test_split_ngrams_example_file(plus_or_quoi_potentially_interesting) :-
    Summary  = '$ACCEPT/MT/GTFeb2012/AMTResults/plus_to_pas_v1_summary.pl',
    Selector = Record^( member(ngram_score=NgramScore, Record),
                        member(freq2=Freq2, Record),
                        (   NgramScore > 4.0
                        ;   Freq2 < 0.001
                        )
                      ),
    Examples = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_plus_or_quoi_test.pl',
    Dev      = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_plus_or_quoi_dev2.pl',
    Test     = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/bigrams_plus_or_quoi_test2.pl',
    split_ngrams_example_file(Summary, Selector, Examples, 5, Dev, Test).

%---------------------------------------------------------------

/*

split_ngrams_example_file(AMTSummaryFile, ExtractionPred, InFile, NExamples, DevFile, TestFile)
split_ngrams_example_file(InFile, NExamples, DevFile, TestFile)

AMTSummaryFile is a Prolog-formatted UTF-8 file produced by
crowdsource_translations:amt_translation_judgements_file_to_prolog/6.

ExtractionPred is a predicate that identifies potentially relevant ngrams.

If the first two args are defined, only examples with potentially relevant
ngrams will be used.

InFile is a file of Prolog-formatted ngram examples with multiplicities.
Typical example:

ngram_example([vous,servez],5,'Si vous vous servez de Identity Safe, avant mise à jour, faire une sauvegarde des identifiants.').

DevFile is a text file of examples, made by taking at most NExamples
examples from each group in InFile.

TestFile has the remaining examples, in the same format as InFile.

*/

% Four-argument form: no AMT summary file and no extraction predicate.
% Placeholder atoms are passed in their place to the six-argument form.
split_ngrams_example_file(InFile, NExamples, DevFile, TestFile) :-
    NoSummaryFile    = '*no_file*',
    NoExtractionPred = '*no_extraction_pred*',
    split_ngrams_example_file(NoSummaryFile, NoExtractionPred,
                              InFile, NExamples, DevFile, TestFile).
% split_ngrams_example_file(+AMTSummaryFile, +ExtractionPred, +InFile,
%                           +NExamples, +DevFile, +TestFile)
%
% Marks the potentially relevant ngrams, loads the ngram examples from
% InFile, then splits the examples of every selected ngram between
% DevFile and TestFile. Both output files are opened as UTF-8 with an
% encoding signature. Prints the two output paths when done.
split_ngrams_example_file(AMTSummaryFile, ExtractionPred, InFile, NExamples, DevFile, TestFile) :-
    mark_potentially_relevant_ngrams(AMTSummaryFile, ExtractionPred),

    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(DevFile, AbsDevFile),
    safe_absolute_file_name(TestFile, AbsTestFile),

    load_ngram_examples(AbsInFile),

    open(AbsDevFile, write, SDev, [encoding('UTF-8'), encoding_signature(true)]),
    open(AbsTestFile, write, STest, [encoding('UTF-8'), encoding_signature(true)]),

    find_all_ngram_examples(AllNgrams),
    split_ngrams_examples(AllNgrams, NExamples, SDev, STest),

    close(SDev),
    close(STest),

    format('~NWritten dev examples to ~w~n', [AbsDevFile]),
    format('~NWritten test examples to ~w~n', [AbsTestFile]).

%---------------------------------------------------------------

% potentially_relevant_ngram(?Ngram)
% One fact per ngram selected by the extraction predicate.
:- dynamic potentially_relevant_ngram/1.

% Succeeds if at least one ngram is currently marked.
potentially_relevant_ngrams_are_defined :-
    potentially_relevant_ngram(_).

% mark_potentially_relevant_ngrams(+AMTSummaryFile, +ExtractionPred)
%
% Clears the previous marks, reads the records in AMTSummaryFile and
% marks the ngram of every record accepted by ExtractionPred.
% Prints an error message and fails if anything goes wrong.
%
% split_ngrams_example_file/4 passes the placeholder '*no_file*' when
% there is no summary file. That placeholder is not a readable file,
% so it is handled first: the marks are cleared and nothing is read,
% which leaves every ngram eligible for selection.
mark_potentially_relevant_ngrams(AMTSummaryFile, _ExtractionPred) :-
    AMTSummaryFile == '*no_file*',
    !,
    retractall(potentially_relevant_ngram(_)).
mark_potentially_relevant_ngrams(AMTSummaryFile, ExtractionPred) :-
    retractall(potentially_relevant_ngram(_)),
    safe_absolute_file_name(AMTSummaryFile, AbsAMTSummaryFile),
    read_amt_summary_file(AbsAMTSummaryFile, List),
    mark_potentially_relevant_ngrams1(List, ExtractionPred, 0-N),
    format('~N--- Marked ~d potentially relevant ngrams from ~w~n', [N, AbsAMTSummaryFile]),
    !.
mark_potentially_relevant_ngrams(AMTSummaryFile, ExtractionPred) :-
    format('~N*** Error: bad call: ~w~n',
           [mark_potentially_relevant_ngrams(AMTSummaryFile, ExtractionPred)]),
    fail.

% Reads the summary file as a list of Prolog terms, using UTF-8.
read_amt_summary_file(File, List) :-
    safe_prolog_file_to_list(File, List, 'UTF-8'),
    !.

% mark_potentially_relevant_ngrams1(+Records, +ExtractionPred, +In-Out)
% Processes the records in order, threading the count of newly marked ngrams.
mark_potentially_relevant_ngrams1([], _ExtractionPred, N-N).
mark_potentially_relevant_ngrams1([F | R], ExtractionPred, In-Out) :-
    mark_potentially_relevant_ngrams_in_record(F, ExtractionPred, In-Next),
    !,
    mark_potentially_relevant_ngrams1(R, ExtractionPred, Next-Out).
% mark_potentially_relevant_ngrams_in_record(+Record, +ExtractionPred, +In-Out)
%
% Record is a list containing an element ngram=Ngram. ExtractionPred is
% Var^Goal; a fresh copy of it is taken, its variable is bound to Record
% and the goal is called. If the goal succeeds, Ngram is marked.
% Out is In plus one when a new ngram was marked, otherwise Out = In.
mark_potentially_relevant_ngrams_in_record(Record, ExtractionPred, In-Out) :-
    member(ngram=Ngram, Record),
    ExtractionPred = Var^Goal,
    % Copy so that the caller's predicate is not bound to this record.
    copy_term(Var^Goal, Record^RecordGoal),
    (   call(RecordGoal) ->
        mark_as_potentially_relevant_ngram(Ngram, In-Out)
    ;   Out = In
    ),
    !.
mark_potentially_relevant_ngrams_in_record(Record, ExtractionPred, InOut) :-
    format('~N*** Error: bad call: ~w~n',
           [mark_potentially_relevant_ngrams_in_record(Record, ExtractionPred, InOut)]),
    fail.

% Marks Ngram unless it is already marked; only a new mark is counted.
mark_as_potentially_relevant_ngram(Ngram, In-Out) :-
    potentially_relevant_ngram(Ngram),
    !,
    Out = In.
mark_as_potentially_relevant_ngram(Ngram, In-Out) :-
    assertz(potentially_relevant_ngram(Ngram)),
    format('~N--- Potentially relevant ngram: ~w~n', [Ngram]),
    Out is In + 1.

%---------------------------------------------------------------

% Compiles the example file into the module ngram_examples.
load_ngram_examples(AbsInFile) :-
    format('~NLoading ngram examples from ~w~n', [AbsInFile]),
    safe_compile(ngram_examples, AbsInFile).

% find_all_ngram_examples(-AllNgrams)
%
% AllNgrams is the sorted, duplicate-free list of ngrams that have at
% least one loaded example. When some ngrams are marked as potentially
% relevant, only marked ngrams are considered.
find_all_ngram_examples(AllNgrams) :-
    HasExample = ngram_examples:ngram_example(NGram, _, _),
    (   potentially_relevant_ngrams_are_defined ->
        Generator = ( potentially_relevant_ngram(NGram), HasExample )
    ;   Generator = HasExample
    ),
    findall(NGram, Generator, Unsorted),
    sort(Unsorted, AllNgrams),
    length(AllNgrams, N),
    format('~N--- Found examples for ~d ngrams~n', [N]).

% Splits the examples of each ngram in turn.
split_ngrams_examples([], _NExamples, _SDev, _STest).
split_ngrams_examples([NGram | NGrams], NExamples, SDev, STest) :-
    split_examples_for_single_ngram(NGram, NExamples, SDev, STest),
    !,
    split_ngrams_examples(NGrams, NExamples, SDev, STest).

% Collects the records for NGram in random order and splits them.
split_examples_for_single_ngram(NGram, NExamples, SDev, STest) :-
    all_records_for_ngram(NGram, Records),
    split_examples_for_single_ngram1(Records, NGram, NExamples, SDev, STest).

% Records is a random permutation of all ngram_example/3 facts for NGram.
all_records_for_ngram(NGram, Records) :-
    findall(ngram_example(NGram, Multiplicity, Example),
            ngram_examples:ngram_example(NGram, Multiplicity, Example),
            Ordered),
    random_permutation(Ordered, Records).

% split_examples_for_single_ngram1(+Records, +NGram, +NExamples, +SDev, +STest)
% NExamples is the number of dev examples still wanted for NGram.

% Reached end. Warn if we were still looking for examples.
split_examples_for_single_ngram1([], NGram, NExamples, _SDev, _STest) :-
    (   NExamples =< 0 ->
        true
    ;   format('~N--- Warning: missing ~d examples for ~w~n', [NExamples, NGram])
    ).
% We have found all our Dev examples. Write everything out to Test.
split_examples_for_single_ngram1([Record | Records], NGram, NExamples, SDev, STest) :-
    NExamples =< 0,
    format(STest, '~N~q.~n', [Record]),
    !,
    split_examples_for_single_ngram1(Records, NGram, 0, SDev, STest).
% Still collecting: one occurrence of the example goes to Dev with
% multiplicity 1; any remaining occurrences go to Test.
split_examples_for_single_ngram1([ngram_example(NGram, Multiplicity, Example) | Records],
                                 NGram, NExamples, SDev, STest) :-
    NExamples > 0,
    format(SDev, '~N~q.~n', [ngram_example(NGram, 1, Example)]),
    Remaining is Multiplicity - 1,
    (   Remaining > 0 ->
        format(STest, '~N~q.~n', [ngram_example(NGram, Remaining, Example)])
    ;   true
    ),
    StillWanted is NExamples - 1,
    !,
    split_examples_for_single_ngram1(Records, NGram, StillWanted, SDev, STest).

%---------------------------------------------------------------

% extract_interesting_examples(+TokenizedFile, +ComparisonFile, +Cutoff, +MaxExamples, +OutFile)
% Same as the six-argument form with Format = text.
extract_interesting_examples(TokenizedFile, ComparisonFile, Cutoff, MaxExamples, OutFile) :-
    DefaultFormat = text,
    extract_interesting_examples(TokenizedFile, ComparisonFile, Cutoff, MaxExamples,
                                 DefaultFormat, OutFile).

% extract_interesting_examples(+TokenizedFile, +ComparisonFile, +Cutoff, +MaxExamples, +Format, +OutFile)
%
% Checks Format, resolves the three file names, stores the interesting
% ngrams, stores the examples found for them in TokenizedFile, and
% writes the examples to OutFile in the requested format.
extract_interesting_examples(TokenizedFile, ComparisonFile, Cutoff, MaxExamples, Format, OutFile) :-
    check_format(Format),

    absolute_file_name(TokenizedFile, AbsTokenizedFile),
    absolute_file_name(ComparisonFile, AbsComparisonFile),
    absolute_file_name(OutFile, AbsOutFile),

    store_interesting_ngrams(AbsComparisonFile, Cutoff),
    store_interesting_examples(AbsTokenizedFile, MaxExamples),
    write_out_interesting_examples(AbsOutFile, Format).

% Succeeds for text, csv and pl; otherwise prints an error and fails.
check_format(Format) :-
    (   Format = text
    ;   Format = csv
    ;   Format = pl
    ),
    !.
check_format(Other) :-
    format('~N*** Error in extract_interesting_examples/6: format arg "~w" must be "text", "csv" or "pl"~n', [Other]),
    fail.

%---------------------------------------------------------------

% interesting_ngram(FirstWord, Words, Count1, Count2, Score)
:- dynamic interesting_ngram/5.
% store_interesting_ngrams(+AbsComparisonFile, +CutoffOrFile)
%
% Replaces the stored interesting_ngram/5 facts.
%
% If the second argument is a number, it is a score cutoff: terms
% ngram(Words, Count1, Count2, Score) are read from AbsComparisonFile
% and those with Score above the cutoff are stored. Any number is
% accepted (integer or float), so that a float cutoff is not mistaken
% for a file name.
%
% Otherwise the second argument names a file of interesting_ngram(Words)
% terms, which are stored with zero counts and score; AbsComparisonFile
% is then ignored.
store_interesting_ngrams(AbsComparisonFile, Cutoff) :-
    number(Cutoff),
    !,
    retractall(interesting_ngram(_, _, _, _, _)),
    open(AbsComparisonFile, read, SIn, [encoding('UTF-8')]),
    % Close the stream whether or not reading succeeds.
    (   store_interesting_ngrams(SIn, Cutoff, 0-N) ->
        close(SIn)
    ;   close(SIn),
        fail
    ),
    format('~N--- Stored ~d potentially interesting ngrams from ~w~n', [N, AbsComparisonFile]).
store_interesting_ngrams(_AbsComparisonFile, InterestingNgramFile) :-
    retractall(interesting_ngram(_, _, _, _, _)),
    safe_absolute_file_name(InterestingNgramFile, AbsInterestingNgramFile),
    safe_prolog_file_to_list_printing_statistics(AbsInterestingNgramFile, InterestingNGrams),
    length(InterestingNGrams, N),
    store_interesting_ngrams_from_list(InterestingNGrams),
    format('~N--- Stored ~d potentially interesting ngrams from ~w~n', [N, AbsInterestingNgramFile]).

% store_interesting_ngrams(+SIn, +Cutoff, +NIn-NOut)
% Reads the next term from SIn and processes it; NIn-NOut threads the
% number of ngrams stored.
store_interesting_ngrams(SIn, Cutoff, NIn-NOut) :-
    read(SIn, Term),
    !,
    store_interesting_ngrams1(Term, SIn, Cutoff, NIn-NOut).

% End of file: stop.
store_interesting_ngrams1(end_of_file, _SIn, _Cutoff, NIn-NIn) :-
    !.
% First score below the cutoff: stop reading.
% NOTE(review): this presumably relies on the file being sorted by
% descending score -- confirm against the code that produces it.
store_interesting_ngrams1(Term, _SIn, Cutoff, NIn-NIn) :-
    Term = ngram(_Words, _Count1, _Count2, Score),
    Score < Cutoff,
    !.
% Score at or above the cutoff: store it if strictly above, then go on.
store_interesting_ngrams1(Term, SIn, Cutoff, NIn-NOut) :-
    Term = ngram(Words, Count1, Count2, Score),
    Words = [FirstWord | _],
    (   Score > Cutoff ->
        assertz(interesting_ngram(FirstWord, Words, Count1, Count2, Score)),
        NNext is NIn + 1
    ;   otherwise ->
        NNext = NIn
    ),
    !,
    store_interesting_ngrams(SIn, Cutoff, NNext-NOut).
% Anything else is a malformed term: report it instead of failing silently.
store_interesting_ngrams1(Term, SIn, Cutoff, Counts) :-
    format('~N*** Error: bad call: ~w~n',
           [store_interesting_ngrams1(Term, SIn, Cutoff, Counts)]),
    fail.

% Stores every interesting_ngram(Words) term of the list.
store_interesting_ngrams_from_list([]).
store_interesting_ngrams_from_list([F | R]) :-
    store_interesting_ngram_from_list(F),
    !,
    store_interesting_ngrams_from_list(R).

% Stores one ngram, indexed by its first word, with zero counts and score.
% Prints an error and fails for any other term or for an empty word list.
store_interesting_ngram_from_list(interesting_ngram(Words)) :-
    Words = [FirstWord | _Rest],
    assertz(interesting_ngram(FirstWord, Words, 0.0, 0.0, 0.0)),
    !.
store_interesting_ngram_from_list(F) :-
    format('~N*** Error: bad call: ~w~n', [store_interesting_ngram_from_list(F)]),
    fail.

%---------------------------------------------------------------

% interesting_example(NGram, ExampleAtom)
:- dynamic interesting_example/2.
% number_of_occurrences_of_example(ExampleAtom, Count)
:- dynamic number_of_occurrences_of_example/2.

% store_interesting_examples(+AbsTokenizedFile, +MaxExamples)
%
% Clears the stored examples and occurrence counts, then reads the
% tokenised sentences from AbsTokenizedFile (UTF-8) and stores the
% examples found in them. Prints how many examples were stored.
store_interesting_examples(AbsTokenizedFile, MaxExamples) :-
    format('~N--- Looking for potentially interesting examples in ~w~n', [AbsTokenizedFile]),
    retractall(interesting_example(_, _)),
    retractall(number_of_occurrences_of_example(_, _)),

    open(AbsTokenizedFile, read, Stream, [encoding('UTF-8')]),
    store_interesting_examples(Stream, MaxExamples, 0-NStored, 0),
    close(Stream),

    format('~N--- Stored ~d potentially interesting examples from ~w~n', [NStored, AbsTokenizedFile]).

% store_interesting_examples(+SIn, +MaxExamples, +NIn-NOut, +I)
% Reads the next term; I is the number of terms processed so far and
% NIn-NOut threads the number of examples stored.
store_interesting_examples(SIn, MaxExamples, NIn-NOut, I) :-
    read(SIn, Term),
    !,
    store_interesting_examples1(Term, SIn, MaxExamples, NIn-NOut, I).

store_interesting_examples1(end_of_file, _SIn, _MaxExamples, NIn-NIn, _I) :-
    !.
store_interesting_examples1(Sentence, SIn, MaxExamples, NIn-NOut, I) :-
    Processed is I + 1,
    % Progress indicator: print the running count every 1000 terms.
    (   Processed mod 1000 =:= 0 ->
        format('~d ', [Processed]),
        flush_output(user)
    ;   true
    ),
    store_interesting_examples2(Sentence, MaxExamples, NIn-NNext),
    !,
    store_interesting_examples(SIn, MaxExamples, NNext-NOut, Processed).

% store_interesting_examples2(+TokenizedSent, +MaxExamples, +NIn-NOut)
%
% Ngrams are looked up in the canonical form of the sentence, while the
% stored example atom is built from the sentence as read.
store_interesting_examples2(TokenizedSent, MaxExamples, NIn-NOut) :-
    make_tokenized_sent_canonical(TokenizedSent, CanonicalSent),
    tokenized_sent_to_atom(TokenizedSent, SentAtom),
    %join_with_spaces(TokenizedSent, TokenizedSentAtom),
    store_interesting_examples3(CanonicalSent, SentAtom, MaxExamples, NIn-NOut),
    !.
store_interesting_examples2(TokenizedSent, MaxExamples, N) :-
    format('~N*** Error: bad call "~w"~n',
           [store_interesting_examples2(TokenizedSent, MaxExamples, N)]),
    fail.

% store_interesting_examples3(+Words, +SentAtom, +MaxExamples, +NIn-NOut)
% Nothing left to scan.
store_interesting_examples3([], _TokenizedSentAtom, _MaxExamples, NIn-NIn).
% At each position of the sentence, try the interesting unigram, bigram
% and trigram starting there, in that order, then move on one word.
store_interesting_examples3([Word | Words], SentAtom, MaxExamples, NIn-NOut) :-
    Suffix = [Word | Words],
    (   interesting_unigram_prefix(UniGram, Suffix) ->
        store_interesting_example(SentAtom, UniGram, MaxExamples, NIn-N1)
    ;   N1 = NIn
    ),
    (   interesting_bigram_prefix(BiGram, Suffix) ->
        store_interesting_example(SentAtom, BiGram, MaxExamples, N1-N2)
    ;   N2 = N1
    ),
    (   interesting_trigram_prefix(TriGram, Suffix) ->
        store_interesting_example(SentAtom, TriGram, MaxExamples, N2-N3)
    ;   N3 = N2
    ),
    !,
    store_interesting_examples3(Words, SentAtom, MaxExamples, N3-NOut).

% interesting_{uni,bi,tri}gram_prefix(-NGram, +Words)
% NGram is the 1-, 2- or 3-word prefix of Words, provided it is stored
% as an interesting ngram.
interesting_unigram_prefix([W1], [W1 | _]) :-
    interesting_ngram(W1, [W1], _, _, _),
    !.

interesting_bigram_prefix([W1, W2], [W1, W2 | _]) :-
    interesting_ngram(W1, [W1, W2], _, _, _),
    !.

interesting_trigram_prefix([W1, W2, W3], [W1, W2, W3 | _]) :-
    interesting_ngram(W1, [W1, W2, W3], _, _, _),
    !.

% store_interesting_example(+SentAtom, +NGram, +MaxExamples, +NIn-NOut)
%
% Always increments the occurrence count of SentAtom. The sentence is
% stored as an example for NGram only if NGram has fewer than
% MaxExamples examples, the sentence is not already stored for any
% ngram, and it contains no bad characters. NOut is NIn + 1 when an
% example was stored, otherwise NOut = NIn.
store_interesting_example(SentAtom, NGram, MaxExamples, NIn-NOut) :-
    inc_number_of_occurrences_of_example(SentAtom),
    number_of_examples_for_ngram(NGram, NSoFar),
    NSoFar < MaxExamples,
    \+ interesting_example(_, SentAtom),
    \+ bad_chars_in_atom(SentAtom),
    assertz(interesting_example(NGram, SentAtom)),
    NOut is NIn + 1,
    !.
store_interesting_example(_TokenizedSent, _NGram, _MaxExamples, NIn-NIn).

% NExamples is the number of examples currently stored for NGram.
number_of_examples_for_ngram(NGram, NExamples) :-
    findall(x, interesting_example(NGram, _), Hits),
    length(Hits, NExamples),
    !.

% Adds one to the stored count for Example, starting from 0 if absent.
inc_number_of_occurrences_of_example(Example) :-
    (   number_of_occurrences_of_example(Example, Old) ->
        retractall(number_of_occurrences_of_example(Example, _))
    ;   Old = 0
    ),
    New is Old + 1,
    assertz(number_of_occurrences_of_example(Example, New)),
    !.
% make_tokenized_sent_canonical(+TokenizedSentIn, -TokenizedSentOut)
% Lowercases the tokens, then removes spaces and punctuation.
% Prints an error message and fails if either step fails.
make_tokenized_sent_canonical(TokenizedSentIn, TokenizedSentOut) :-
    lowercase_atom_list(TokenizedSentIn, TokenizedSentNext),
    remove_spaces_and_punctuation(TokenizedSentNext, TokenizedSentOut),
    !.
make_tokenized_sent_canonical(TokenizedSentIn, TokenizedSentOut) :-
    format('~N*** Error: bad call "~w"~n',
           [make_tokenized_sent_canonical(TokenizedSentIn, TokenizedSentOut)]),
    fail.

% Succeeds if Atom contains a character listed by bad_code/1.
bad_chars_in_atom(Atom) :-
    atom_codes(Atom, Codes),
    bad_chars_in_str(Codes).

bad_chars_in_str([F | _R]) :-
    bad_code(F),
    !.
bad_chars_in_str([_F | R]) :-
    bad_chars_in_str(R).

% The double quote is the only bad character.
bad_code(0'"). %"

%---------------------------------------------------------------

% write_out_interesting_examples(+AbsOutFile, +Format)
%
% Writes the stored examples of every stored interesting ngram to
% AbsOutFile (UTF-8 with an encoding signature) and prints how many
% examples were written.
write_out_interesting_examples(AbsOutFile, Format) :-
    open(AbsOutFile, write, SOut, [encoding('UTF-8'), encoding_signature(true)]),
    all_interesting_ngrams(Ngrams),
    write_out_interesting_examples(Ngrams, Format, SOut, 0-N),
    close(SOut),
    format('~N--- Written ~d potentially interesting examples to ~w~n', [N, AbsOutFile]).

% write_out_interesting_examples(+Ngrams, +Format, +SOut, +NIn-NOut)
% NIn-NOut threads the number of examples written.
write_out_interesting_examples([], _Format, _SOut, N-N) :-
    !.
write_out_interesting_examples([F | R], Format, SOut, NIn-NOut) :-
    write_out_interesting_examples_for_ngram(F, Format, SOut, NIn-NNext),
    !,
    write_out_interesting_examples(R, Format, SOut, NNext-NOut).

% write_out_interesting_examples_for_ngram(+NGram, +Format, +SOut, +NIn-NOut)
%
% NGram is an interesting_ngram/5 term. In text format a header line is
% written for the ngram; in csv format the words and counts become the
% label used on each example line; in pl format the word list itself
% is the label. Warns if the ngram has no examples.
%
% The counts are printed with ~w rather than ~d: ngrams loaded from an
% ngram list file carry the counts 0.0, and ~d only accepts integers.
% Integer counts are printed exactly as before.
write_out_interesting_examples_for_ngram(NGram, Format, SOut, NIn-NOut) :-
    NGram = interesting_ngram(_FirstWord, Words, Count1, Count2, _Score),
    (   Format = text ->
        format(SOut, '~N~n%NGram: ~w; #occurrences in first set: ~w; #occurrences in second set: ~w~n',
               [Words, Count1, Count2]),
        NGramAtom = ''
    ;   Format = csv ->
        format_to_atom('~w ~w ~w', [Words, Count1, Count2], NGramAtom)
    ;   otherwise ->
        NGramAtom = Words
    ),
    all_examples_for_ngram(Words, Examples),
    (   Examples = [] ->
        NIn = NOut,
        format('~N--- Warning: no examples for ~w~n', [Words])
    ;   otherwise ->
        write_out_interesting_examples_for_ngram1(Examples, Format, NGramAtom, SOut, NIn-NOut)
    ).

% write_out_interesting_examples_for_ngram1(+Examples, +Format, +NGramAtom, +SOut, +NIn-NOut)
% No examples left.
write_out_interesting_examples_for_ngram1([], _Format, _NGramAtom, _SOut, NIn-NIn).
% Write the first example, count it, and continue with the rest.
write_out_interesting_examples_for_ngram1([Example | Examples], Format, NGramAtom, SOut, NIn-NOut) :-
    write_out_interesting_example(Example, Format, NGramAtom, SOut),
    NNext is NIn + 1,
    !,
    write_out_interesting_examples_for_ngram1(Examples, Format, NGramAtom, SOut, NNext-NOut).

% write_out_interesting_example(+Example, +Format, +NGramLabel, +SOut)
%
% text: the example on a line of its own.
% csv:  the label and the example, each in double quotes, separated by ";".
% pl:   a quoted term ngram_example(Label, N, Example), where N is the
%       stored occurrence count of the example (0 if none is stored).
% The last clause prints an error message and fails.
write_out_interesting_example(Example, text, _NGramAtom, SOut) :-
    format(SOut, '~N~w~n', [Example]).
write_out_interesting_example(Example, csv, NGramAtom, SOut) :-
    format(SOut, '~N"~w";"~w"~n', [NGramAtom, Example]).
write_out_interesting_example(Example, pl, NGram, SOut) :-
    (   number_of_occurrences_of_example(Example, Count) ->
        true
    ;   Count = 0
    ),
    Record = ngram_example(NGram, Count, Example),
    format(SOut, '~N~q.~n', [Record]).
write_out_interesting_example(F, Format, NGramAtom, SOut) :-
    format('~N*** Error: bad call "~w"~n',
           [write_out_interesting_example(F, Format, NGramAtom, SOut)]),
    fail.

% Ngrams is the list of all stored interesting_ngram/5 facts.
all_interesting_ngrams(Ngrams) :-
    Fact = interesting_ngram(_FirstWord, _Words, _Count1, _Count2, _Score),
    findall(Fact, Fact, Ngrams).

% Examples is the list of example atoms stored for NGram.
all_examples_for_ngram(NGram, Examples) :-
    findall(Example,
            interesting_example(NGram, Example),
            Examples).