:- ensure_loaded('$REGULUS/PrologLib/compatibility').

%---------------------------------------------------------------

:- module(manipulate_ngrams,
          [normalise_ngram_file/2,
           add_normalised_ngram_files/3,
           combine_normalised_ngram_files/3,
           order_combined_ngrams/2,
           order_combined_ngrams/3,
           load_combined_ngrams/1,
           normalised_frequencies_for_ngram/3,

           test_normalise_ngram_file/1,
           test_add_normalised_ngram_files/1,
           test_combine_normalised_ngram_files/1,
           test_order_combined_ngrams/1
          ]
         ).

%---------------------------------------------------------------

:- use_module('$REGULUS/PrologLib/utilities').
:- use_module(library(lists)).

%---------------------------------------------------------------

% test_normalise_ngram_file(+Id)
% Canned invocations of normalise_ngram_file/2, selected by Id.
test_normalise_ngram_file(forum) :-
    In  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_normalised.pl',
    normalise_ngram_file(In, Out).
test_normalise_ngram_file(tm_fr) :-
    In  = '$ACCEPT/MT/GTFeb2012/TM/tm_fr_ngrams.pl',
    Out = '$ACCEPT/MT/GTFeb2012/TM/tm_fr_ngrams_normalised.pl',
    normalise_ngram_file(In, Out).
test_normalise_ngram_file(europarl_fr) :-
    In  = '$ACCEPT/MT/Europarl/Generated/europarl-v6-fr-ngrams.pl',
    Out = '$ACCEPT/MT/Europarl/Generated/europarl-v6-fr-ngrams-normalised.pl',
    normalise_ngram_file(In, Out).

% test_add_normalised_ngram_files(+Id)
% Canned invocations of add_normalised_ngram_files/3.  The "_short"
% variant skips loading and calls add_normalised_ngram_files1/1 directly.
test_add_normalised_ngram_files(tm_fr_europarl_fr) :-
    In1 = '$ACCEPT/MT/GTFeb2012/TM/tm_fr_ngrams_normalised.pl',
    In2 = '$ACCEPT/MT/Europarl/Generated/europarl-v6-fr-ngrams-normalised.pl',
    Out = '$ACCEPT/MT/Europarl/Generated/europarl-fr-plus-tm-fr-ngrams-normalised.pl',
    add_normalised_ngram_files(In1, In2, Out).
test_add_normalised_ngram_files(tm_fr_europarl_fr_short) :-
    Out = '$ACCEPT/MT/Europarl/Generated/europarl-fr-plus-tm-fr-ngrams-normalised.pl',
    add_normalised_ngram_files1(Out).

% test_combine_normalised_ngram_files(+Id)
% Canned invocations of combine_normalised_ngram_files/3.
test_combine_normalised_ngram_files(forum_europarl_fr) :-
    In1 = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_normalised.pl',
    In2 = '$ACCEPT/MT/Europarl/Generated/europarl-v6-fr-ngrams-normalised.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_europarl.pl',
    combine_normalised_ngram_files(In1, In2, Out).
% Further canned invocations of combine_normalised_ngram_files/3.
test_combine_normalised_ngram_files(forum_tm_fr) :-
    In1 = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_normalised.pl',
    In2 = '$ACCEPT/MT/GTFeb2012/TM/tm_fr_ngrams_normalised.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm.pl',
    combine_normalised_ngram_files(In1, In2, Out).
test_combine_normalised_ngram_files(forum_tm_fr_plus_europarl) :-
    In1 = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_normalised.pl',
    % Alternative first input, currently disabled:
    %In1 = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_normalised_20.pl',
    In2 = '$ACCEPT/MT/Europarl/Generated/europarl-fr-plus-tm-fr-ngrams-normalised.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    combine_normalised_ngram_files(In1, In2, Out).
% The "_short" variant skips loading and only calls
% combine_normalised_ngram_files1/1.
test_combine_normalised_ngram_files(forum_europarl_fr_short) :-
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_europarl.pl',
    combine_normalised_ngram_files1(Out).

% test_order_combined_ngrams(+Id)
% Canned invocations of order_combined_ngrams/2 and /3.
test_order_combined_ngrams(forum_europarl_fr) :-
    In  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_europarl.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_europarl_ordered.pl',
    order_combined_ngrams(In, Out).
test_order_combined_ngrams(forum_tm_fr) :-
    In  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_ordered.pl',
    order_combined_ngrams(In, Out).
test_order_combined_ngrams(forum_tm_fr_plus_europarl) :-
    In  = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    Out = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered.pl',
    order_combined_ngrams(In, Out).
% Loads the reference ngrams into module ngrams2 first, then runs the
% "split_main" case.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_split) :-
    Reference = '$ACCEPT/MT/Europarl/Generated/europarl-fr-plus-tm-fr-ngrams-normalised.pl',
    read_ngram_file(Reference, ngrams2),
    test_order_combined_ngrams(forum_tm_fr_plus_europarl_split_main).
% Compiles the stopword file, then writes separate ordered unigram,
% bigram and trigram files from one combined ngram file, each with its
% own filter predicate.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_split_main) :-
    Combined = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    compile('$ACCEPT/MT/GTFeb2012/Prolog/stopwords.pl'),

    order_combined_ngrams(Combined,
                          unigram_and_alphabetic,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_unigrams.pl'),

    % Disabled alternative bigram filter:
    %   bigram_and_alphabetic
    order_combined_ngrams(Combined,
                          bigram_and_alphabetic_no_stopwords,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams.pl'),

    % Disabled alternative trigram filters:
    %   trigram_and_alphabetic
    %   trigram_and_alphabetic_at_least_one_rare_bigram
    %   trigram_and_alphabetic_two_rare_bigrams
    %   trigram_and_alphabetic_no_rare_unigrams
    %   trigram_and_alphabetic_no_rare_unigrams_or_bigrams
    %   trigram_and_alphabetic_no_stopwords
    %   trigram_and_alphabetic_rare_unigrams_or_bigrams_no_stopwords
    order_combined_ngrams(Combined,
                          trigram_and_alphabetic_two_rare_bigrams_no_stopwords,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams.pl').

% Compiles the stopword file, then writes the bigrams and trigrams
% selected by the "includes_clitic" filters.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_clitics) :-
    Combined = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',
    compile('$ACCEPT/MT/GTFeb2012/Prolog/stopwords.pl'),

    order_combined_ngrams(Combined,
                          bigram_and_alphabetic_includes_clitic,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_clitics.pl'),

    order_combined_ngrams(Combined,
                          trigram_and_alphabetic_includes_clitic,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_clitics.pl').
% Bigrams and trigrams selected by the "includes_plus_or_quoi" filters.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_plus_or_quoi) :-
    %compile('$ACCEPT/MT/GTFeb2012/Prolog/stopwords.pl'),
    Combined = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',

    order_combined_ngrams(Combined,
                          bigram_and_alphabetic_includes_plus_or_quoi,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_plus_or_quoi.pl'),

    order_combined_ngrams(Combined,
                          trigram_and_alphabetic_includes_plus_or_quoi,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_plus_or_quoi.pl').

% Bigrams and trigrams selected by the "includes_jamais" filters.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_jamais) :-
    %compile('$ACCEPT/MT/GTFeb2012/Prolog/stopwords.pl'),
    Combined = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',

    order_combined_ngrams(Combined,
                          bigram_and_alphabetic_includes_jamais,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_jamais.pl'),

    order_combined_ngrams(Combined,
                          trigram_and_alphabetic_includes_jamais,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_jamais.pl').

% Bigrams and trigrams selected by the "includes_on" filters.
test_order_combined_ngrams(forum_tm_fr_plus_europarl_on) :-
    %compile('$ACCEPT/MT/GTFeb2012/Prolog/stopwords.pl'),
    Combined = '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl.pl',

    order_combined_ngrams(Combined,
                          bigram_and_alphabetic_includes_on,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_bigrams_on.pl'),

    order_combined_ngrams(Combined,
                          trigram_and_alphabetic_includes_on,
                          '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams_versus_tm_plus_europarl_ordered_trigrams_on.pl').
%---------------------------------------------------------------

% normalise_ngram_file(+InFile, +OutFile)
%
% Loads InFile into module user (as ngram/2 facts, see
% write_normalised_ngrams/3), then writes the unigrams, bigrams and
% trigrams to OutFile in that order and reports the number of distinct
% ngrams written.
%
% The output stream is closed through call_cleanup/2, so it is released
% even if writing fails or raises an exception.  once/1 makes the
% writing goal determinate, so the cleanup runs before the final
% message is printed.
normalise_ngram_file(InFile, OutFile) :-
    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(OutFile, AbsOutFile),

    read_ngram_file(AbsInFile, user),

    open(AbsOutFile, write, SOut, [encoding('UTF-8'), encoding_signature(true)]),
    call_cleanup(once(( write_normalised_ngrams(1, SOut, N1),
                        write_normalised_ngrams(2, SOut, N2),
                        write_normalised_ngrams(3, SOut, N3)
                      )),
                 close(SOut)),

    N is N1 + N2 + N3,
    format('~N--- Written file (~d ngrams) ~w~n', [N, AbsOutFile]).

%---------------------------------------------------------------

% combine_normalised_ngram_files(+InFile1, +InFile2, +OutFile)
%
% Loads the two normalised files into modules ngrams1 and ngrams2 and
% writes the combined file.
combine_normalised_ngram_files(InFile1, InFile2, OutFile) :-
    load_both_ngram_sets(InFile1, InFile2),
    combine_normalised_ngram_files1(OutFile).

% combine_normalised_ngram_files1(+OutFile)
%
% Writes the combined file from whatever is currently loaded in modules
% ngrams1 and ngrams2.  The stream is closed via call_cleanup/2 so it
% does not leak on failure or error.
combine_normalised_ngram_files1(OutFile) :-
    safe_absolute_file_name(OutFile, AbsOutFile),

    open(AbsOutFile, write, SOut, [encoding('UTF-8'), encoding_signature(true)]),
    call_cleanup(once(write_combined_ngrams(SOut, ngrams1, ngrams2)),
                 close(SOut)),

    format('~N--- Written combined ngrams file ~w~n', [AbsOutFile]).

% load_both_ngram_sets(+InFile1, +InFile2)
%
% Loads InFile1 into module ngrams1 and InFile2 into module ngrams2.
load_both_ngram_sets(InFile1, InFile2) :-
    safe_absolute_file_name(InFile1, AbsInFile1),
    safe_absolute_file_name(InFile2, AbsInFile2),

    read_ngram_file(AbsInFile1, ngrams1),
    read_ngram_file(AbsInFile2, ngrams2),
    !.

% load_combined_ngrams(+InFile)
%
% Loads a combined ngram file into module combined_ngrams.
load_combined_ngrams(InFile) :-
    safe_absolute_file_name(InFile, AbsInFile),
    read_ngram_file(AbsInFile, combined_ngrams),
    !.

% normalised_frequencies_for_ngram(+NGram, -Score1, -Score2)
%
% Looks NGram up in combined_ngrams:ngram/3; if there is no matching
% fact, both scores default to 0.0.
normalised_frequencies_for_ngram(NGram, Score1, Score2) :-
    combined_ngrams:ngram(NGram, Score1, Score2),
    !.
normalised_frequencies_for_ngram(_NGram, Score1, Score2) :-
    Score1 = 0.0,
    Score2 = 0.0,
    !.

%---------------------------------------------------------------

% add_normalised_ngram_files(+InFile1, +InFile2, +OutFile)
%
% Loads the two normalised files into modules ngrams1 and ngrams2 and
% writes their union via add_normalised_ngram_files1/1.
add_normalised_ngram_files(InFile1, InFile2, OutFile) :-
    load_both_ngram_sets(InFile1, InFile2),
    add_normalised_ngram_files1(OutFile).
% add_normalised_ngram_files1(+OutFile)
%
% Writes the merged contents of modules ngrams1 and ngrams2 to OutFile
% and reports how many ngrams were written.  The stream is closed via
% call_cleanup/2 so it does not leak on failure or error.  The report
% message has a space between the count and the file name, matching the
% message printed by normalise_ngram_file/2.
add_normalised_ngram_files1(OutFile) :-
    safe_absolute_file_name(OutFile, AbsOutFile),

    open(AbsOutFile, write, SOut, [encoding('UTF-8'), encoding_signature(true)]),
    call_cleanup(once(write_added_ngrams(SOut, ngrams1, ngrams2, N)),
                 close(SOut)),

    format('~N--- Written added ngrams file (~d ngrams) ~w~n', [N, AbsOutFile]).

%---------------------------------------------------------------

% order_combined_ngrams(+InFile, +OutFile)
%
% Same as order_combined_ngrams/3 with the accept-everything filter
% "trivial".
order_combined_ngrams(InFile, OutFile) :-
    order_combined_ngrams(InFile, trivial, OutFile).

% order_combined_ngrams(+InFile, +FilterPred, +OutFile)
%
% Reads the combined ngram records from InFile, keeps those for which
% FilterPred succeeds, sorts them with sort_combined_ngrams/2 and writes
% the result to OutFile.
order_combined_ngrams(InFile, FilterPred, OutFile) :-
    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(OutFile, AbsOutFile),

    safe_prolog_file_to_list_printing_statistics(AbsInFile, InList, 'UTF-8'),

    format('~N--- Filtering elements using "~w"... ', [FilterPred]),
    filter_combined_ngrams(InList, FilterPred, NextList, 0-_NFiltered),
    format('done~n', []),

    format('~N--- Sorting elements... ', []),
    sort_combined_ngrams(NextList, OutList),
    format('done~n', []),

    length(OutList, N),

    list_to_prolog_file_with_encoding(OutList, AbsOutFile, 'UTF-8'),
    format('~N--- Written ordered ngrams comparison file (~d elements) ~w~n', [N, AbsOutFile]).

% filter_combined_ngrams(+Records, +FilterPred, -Kept, +NIn - -NOut)
%
% Kept is the sublist of Records for which call(FilterPred, Record)
% succeeds.  NIn-NOut counts every record examined (kept or not); a
% progress number is printed after each 1000 records.
filter_combined_ngrams([], _FilterPred, [], N-N).
filter_combined_ngrams([F | R], FilterPred, Out, NIn-NOut) :-
    NNext is NIn + 1,
    (   0 is NNext mod 1000 ->
        format('~d ', [NNext]),
        flush_output(user)
    ;
        true
    ),
    % call/2 replaces the manual "=.." construction of the goal.
    (   call(FilterPred, F) ->
        Out = [F | R1]
    ;
        Out = R1
    ),
    !,
    filter_combined_ngrams(R, FilterPred, R1, NNext-NOut).

% sort_combined_ngrams(+InList, -OutList)
%
% Keys every record with its negated diff score, keysorts (so the
% highest diff score comes first), records the high-scoring unigrams and
% bigrams, drops longer ngrams that contain one of them, and strips the
% keys again.
sort_combined_ngrams(InList, OutList) :-
    add_key_to_combined_ngrams(InList, KeyedInList),
    keysort(KeyedInList, SortedKeyedInList),
    store_high_scoring_unigrams_and_bigrams(SortedKeyedInList),
    remove_ngrams_with_high_scoring_sub_ngrams(SortedKeyedInList, SortedKeyedInList1),
    unkey_list(SortedKeyedInList1, OutList).

% add_key_to_combined_ngrams(+Records, -KeyedRecords)
%
% Applies add_key_to_combined_ngram/2 to every element.
add_key_to_combined_ngrams([], []).
add_key_to_combined_ngrams([F | R], [F1 | R1]) :-
    add_key_to_combined_ngram(F, F1),
    !,
    add_key_to_combined_ngrams(R, R1).
% add_key_to_combined_ngram(+Record, -NegScore-ScoredRecord)
%
% Extends ngram(Words, Count1, Count2) with its diff score and pairs it
% with the negated score, so that an ascending keysort lists the highest
% diff scores first.
add_key_to_combined_ngram(Record, Key-Scored) :-
    Record = ngram(Words, Count1, Count2),
    ngram_count_diff_score(Words, Count1, Count2, DiffScore),
    Key is -1 * DiffScore,
    Scored = ngram(Words, Count1, Count2, DiffScore),
    !.

% ngram_count_diff_score(+Words, +Count1, +Count2, -DiffScore)
%
% DiffScore is the length bonus for Words plus Count1 divided by the
% smoothed Count2.
ngram_count_diff_score(Words, Count1, Count2, DiffScore) :-
    length(Words, NWords),
    length_bonus(NWords, Bonus),
    smooth_zero_score(Count2, Denominator),
    DiffScore is Bonus + (Count1 / Denominator),
    !.

% length_bonus(+Length, -Bonus)
%
% The length-specific bonuses below are disabled; every length
% currently gets a bonus of 0.
%length_bonus(1, 1000) :-
%	!.
%length_bonus(2, 500) :-
%	!.
length_bonus(_Other, 0) :-
    !.

% smooth_zero_score(+Count, -SmoothedCount)
%
% Counts below 1.0 are raised to 1.0; other counts are returned as is.
smooth_zero_score(Count, SmoothedCount) :-
    Count < 1.0,
    !,
    SmoothedCount = 1.0.
smooth_zero_score(Count, SmoothedCount) :-
    SmoothedCount = Count.

%---------------------------------------------------------------

% Filter predicates over ngram(Words, Count1, Count2) records.

% Accepts any record.
trivial(_AnyRecord).

% Length tests on the word list.
unigram(ngram(Words, _Count1, _Count2)) :-
    Words = [_].

bigram(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _].

trigram(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _].

% Length test plus "all words alphabetic".
unigram_and_alphabetic(ngram(Words, _Count1, _Count2)) :-
    Words = [_],
    alphabetic_word_list(Words).

bigram_and_alphabetic(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words).

trigram_and_alphabetic(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words).

% Alphabetic trigram in which at least one of the two bigrams is rare.
trigram_and_alphabetic_at_least_one_rare_bigram(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    at_least_one_rare_bigram(Words).

% Conjunction of the "two rare bigrams" and "no stopwords" filters.
trigram_and_alphabetic_two_rare_bigrams_no_stopwords(Record) :-
    Record = ngram([_, _, _], _Count1, _Count2),
    trigram_and_alphabetic_two_rare_bigrams(Record),
    trigram_and_alphabetic_no_stopwords(Record).

% Alphabetic trigram in which both bigrams are rare.
trigram_and_alphabetic_two_rare_bigrams(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    two_rare_bigrams(Words).

% Alphabetic trigram in which none of the three words is rare.
trigram_and_alphabetic_no_rare_unigrams(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    no_rare_unigrams(Words).
% Conjunction of the "no rare unigrams or bigrams" and "no stopwords"
% trigram filters.
trigram_and_alphabetic_rare_unigrams_or_bigrams_no_stopwords(Record) :-
    Record = ngram([_, _, _], _Count1, _Count2),
    trigram_and_alphabetic_no_rare_unigrams_or_bigrams(Record),
    trigram_and_alphabetic_no_stopwords(Record).

% Alphabetic trigram with no rare word and no rare bigram.
trigram_and_alphabetic_no_rare_unigrams_or_bigrams(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    no_rare_unigrams(Words),
    no_rare_bigrams(Words).

% Alphabetic ngrams for which trigram_contains_stopword/1 (resp.
% bigram_contains_stopword/1) does not succeed.
trigram_and_alphabetic_no_stopwords(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    \+ trigram_contains_stopword(Words).

bigram_and_alphabetic_no_stopwords(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words),
    \+ bigram_contains_stopword(Words).

% Alphabetic ngrams for which trigram_contains_clitic/1 (resp.
% bigram_contains_clitic/1) succeeds.
trigram_and_alphabetic_includes_clitic(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    trigram_contains_clitic(Words).

bigram_and_alphabetic_includes_clitic(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words),
    bigram_contains_clitic(Words).

% Alphabetic ngrams containing the word "plus" or the word "quoi".
trigram_and_alphabetic_includes_plus_or_quoi(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    ngram_contains_plus_or_quoi(Words).

bigram_and_alphabetic_includes_plus_or_quoi(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words),
    ngram_contains_plus_or_quoi(Words).

% Commits as soon as "plus" is found; otherwise looks for "quoi".
ngram_contains_plus_or_quoi(Ngram) :-
    (   member(plus, Ngram) ->
        true
    ;
        member(quoi, Ngram)
    ).

% Alphabetic ngrams containing the word "jamais".
trigram_and_alphabetic_includes_jamais(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    ngram_contains_jamais(Words).

bigram_and_alphabetic_includes_jamais(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words),
    ngram_contains_jamais(Words).

ngram_contains_jamais(Ngram) :-
    once(member(jamais, Ngram)).

% Alphabetic trigrams containing the word "on".
trigram_and_alphabetic_includes_on(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _, _],
    alphabetic_word_list(Words),
    ngram_contains_on(Words).
% Alphabetic bigrams containing the word "on".
bigram_and_alphabetic_includes_on(ngram(Words, _Count1, _Count2)) :-
    Words = [_, _],
    alphabetic_word_list(Words),
    ngram_contains_on(Words).

ngram_contains_on(Ngram) :-
    once(member(on, Ngram)).

% Rarity tests on the two bigrams and three words of a trigram.
at_least_one_rare_bigram([A, B, _C]) :-
    rare_bigram([A, B]).
at_least_one_rare_bigram([_A, B, C]) :-
    rare_bigram([B, C]).

two_rare_bigrams([A, B, C]) :-
    rare_bigram([A, B]),
    rare_bigram([B, C]).

no_rare_bigrams([A, B, C]) :-
    \+ rare_bigram([A, B]),
    \+ rare_bigram([B, C]).

no_rare_unigrams([A, B, C]) :-
    \+ rare_unigram(A),
    \+ rare_unigram(B),
    \+ rare_unigram(C).

% A word or bigram is rare when module ngrams2 has no entry for it, or
% when the first entry found has a count below 5.0.
rare_unigram(A) :-
    (   ngrams2:ngram(A, [A], Count) ->
        Count < 5.0
    ;
        true
    ).

rare_bigram([A, B]) :-
    (   ngrams2:ngram(A, [A, B], Count) ->
        Count < 5.0
    ;
        true
    ).

% alphabetic_word_list(+Words): every word passes alphabetic_word/1.
alphabetic_word_list([]).
alphabetic_word_list([Word | Words]) :-
    alphabetic_word(Word),
    !,
    alphabetic_word_list(Words).

% alphabetic_word(+Atom): every character code of Atom passes
% alphabetic_char/1.
alphabetic_word(Atom) :-
    atom_codes(Atom, Codes),
    alphabetic_str(Codes).

alphabetic_str([]).
alphabetic_str([Code | Codes]) :-
    alphabetic_char(Code),
    !,
    alphabetic_str(Codes).

% A character is alphabetic if lowercase_char/1 or uppercase_char/1
% accepts it.
alphabetic_char(Code) :-
    (   lowercase_char(Code) ->
        true
    ;   uppercase_char(Code) ->
        true
    ).

%---------------------------------------------------------------

:- dynamic high_scoring_unigram/1.
:- dynamic high_scoring_bigram/2.

% store_high_scoring_unigrams_and_bigrams(+KeyedRecords)
%
% Clears the high_scoring_unigram/1 and high_scoring_bigram/2 tables and
% refills them from the keyed records.
store_high_scoring_unigrams_and_bigrams(SortedKeyedInList) :-
    format('~N--- Storing high-scoring unigrams and bigrams... ', []),
    retractall(high_scoring_unigram(_)),
    retractall(high_scoring_bigram(_, _)),
    store_high_scoring_unigrams_and_bigrams1(SortedKeyedInList),
    format('done~n', []),
    !.

store_high_scoring_unigrams_and_bigrams1([]).
store_high_scoring_unigrams_and_bigrams1([Record | Records]) :-
    store_high_scoring_unigram_or_bigram(Record),
    !,
    store_high_scoring_unigrams_and_bigrams1(Records).

% Unpacks one keyed record and hands its score and words on.
store_high_scoring_unigram_or_bigram(_Key-Scored) :-
    Scored = ngram(Words, _Count1, _Count2, DiffScore),
    store_high_scoring_unigram_or_bigram1(DiffScore, Words).

% Stores ngrams of fewer than three words whose score passes
% high_score/1.
store_high_scoring_unigram_or_bigram1(DiffScore, Words) :-
    length(Words, NWords),
    NWords < 3,
    high_score(DiffScore),
    !,
    store_high_scoring_unigram_or_bigram2(Words).
% Catch-all clause: ngrams not stored by an earlier clause are ignored.
store_high_scoring_unigram_or_bigram1(_DiffScore, _Words).

% A score is high when it exceeds 2000.
high_score(Score) :-
    Score > 2000.

% Records a unigram or a bigram in the corresponding dynamic table.
store_high_scoring_unigram_or_bigram2([Word]) :-
    assertz(high_scoring_unigram(Word)).
store_high_scoring_unigram_or_bigram2([Word1, Word2]) :-
    assertz(high_scoring_bigram(Word1, Word2)).

%---------------------------------------------------------------

% remove_ngrams_with_high_scoring_sub_ngrams(+KeyedRecords, -Kept)
%
% Kept is KeyedRecords without the records for which
% ngram_record_with_high_scoring_sub_ngrams/1 succeeds.
remove_ngrams_with_high_scoring_sub_ngrams([], []).
remove_ngrams_with_high_scoring_sub_ngrams([Record | Records], Kept) :-
    (   ngram_record_with_high_scoring_sub_ngrams(Record) ->
        Kept = KeptRest
    ;
        Kept = [Record | KeptRest]
    ),
    !,
    remove_ngrams_with_high_scoring_sub_ngrams(Records, KeptRest).

ngram_record_with_high_scoring_sub_ngrams(_Key-Scored) :-
    Scored = ngram(Words, _Count1, _Count2, _DiffScore),
    ngram_with_high_scoring_sub_ngrams(Words),
    !.

% An ngram of two or more words is hit if any of its words is a stored
% high-scoring unigram; a trigram is also hit if either of its bigrams
% is a stored high-scoring bigram.
ngram_with_high_scoring_sub_ngrams(Words) :-
    length(Words, NWords),
    NWords > 1,
    member(Word, Words),
    high_scoring_unigram(Word),
    !.
ngram_with_high_scoring_sub_ngrams([A, B, C]) :-
    (   high_scoring_bigram(A, B) ->
        true
    ;   high_scoring_bigram(B, C) ->
        true
    ).

%---------------------------------------------------------------

% write_combined_ngrams(+SOut, +Module1, +Module2)
%
% For every ngram/3 fact in Module1, writes an ngram(Words, Count1,
% Count2) term to SOut, with Count2 taken from Module2.
write_combined_ngrams(SOut, Module1, Module2) :-
    find_all_ngrams(Module1, NGrams1),
    write_combined_ngrams(NGrams1, SOut, Module2, 0).

% find_all_ngrams(+Module, -NGrams)
%
% Collects all Module:ngram/3 facts and reports how many were found.
find_all_ngrams(Module, NGrams) :-
    format('~N--- Collecting ngrams from module ~w~n', [Module]),
    findall(ngram(FirstWord, Words, Count),
            Module:ngram(FirstWord, Words, Count),
            NGrams),
    length(NGrams, NFound),
    format('~N--- Found ~d ngrams~n', [NFound]),
    !.

% write_combined_ngrams(+NGrams, +SOut, +Module2, +Index)
%
% Writes one line per ngram; Count2 is 0.0 when Module2 has no matching
% fact.  A progress number is printed after each 1000 ngrams.
write_combined_ngrams([], _SOut, _Module2, _Index).
write_combined_ngrams([NGram | NGrams], SOut, Module2, Index) :-
    NGram = ngram(FirstWord, Words, Count1),
    Index1 is Index + 1,
    (   0 is Index1 mod 1000 ->
        format('~d ', [Index1]),
        flush_output(user)
    ;
        true
    ),
    (   Module2:ngram(FirstWord, Words, Count2) ->
        true
    ;
        Count2 = 0.0
    ),
    format(SOut, '~Nngram(~q, ~2f, ~2f).~n', [Words, Count1, Count2]),
    !,
    write_combined_ngrams(NGrams, SOut, Module2, Index1).
%---------------------------------------------------------------

% write_added_ngrams(+SOut, +Module1, +Module2, -N)
%
% Writes the union of the two ngram sets to SOut; N is the total number
% of ngrams written by both passes.
write_added_ngrams(SOut, Module1, Module2, N) :-
    write_added_ngrams1(SOut, Module1, Module2, 0-NAfterFirst),
    write_added_ngrams2(SOut, Module1, Module2, NAfterFirst-N).

% First pass: all ngrams in Module1.  When Module2 has the same ngram,
% the count written is the one safe_max_list/2 picks from the two.
write_added_ngrams1(SOut, Module1, Module2, NIn-NOut) :-
    findall(ngram(FirstWord, Words, Count),
            (   Module1:ngram(FirstWord, Words, Count1),
                (   Module2:ngram(FirstWord, Words, Count2) ->
                    safe_max_list([Count1, Count2], Count)
                ;
                    Count = Count1
                )
            ),
            NGrams),
    write_ngrams(NGrams, SOut, NIn-NOut).

% Second pass: all ngrams in Module2 that do not occur in Module1.
write_added_ngrams2(SOut, Module1, Module2, NIn-NOut) :-
    findall(ngram(FirstWord, Words, Count),
            (   Module2:ngram(FirstWord, Words, Count),
                \+ Module1:ngram(FirstWord, Words, _OtherCount)
            ),
            NGrams),
    write_ngrams(NGrams, SOut, NIn-NOut).

% write_ngrams(+NGrams, +SOut, +NIn - -NOut)
%
% Writes each term as a quoted clause and threads the running count.
write_ngrams([], _SOut, NIn-NIn).
write_ngrams([NGram | NGrams], SOut, NIn-NOut) :-
    write_ngram(NGram, SOut, NIn-NNext),
    !,
    write_ngrams(NGrams, SOut, NNext-NOut).

write_ngram(NGram, SOut, NIn-NOut) :-
    format(SOut, '~N~q.~n', [NGram]),
    NOut is NIn + 1,
    !.

%---------------------------------------------------------------

% read_ngram_file(+File, +Module)
%
% Compiles File into Module with redefinition warnings switched off.
read_ngram_file(File, Module) :-
    %safe_compile(Module, File),
    safe_compile_with_redefine_warnings_off(Module, File),
    format('~N--- Loaded ngrams from ~w into module ~w~n', [File, Module]).

% write_normalised_ngrams(+Length, +SOut, -N)
%
% Collects the ngram/2 facts of the given length from module user,
% sorts them with sort_ngrams/2 and writes them normalised against
% their summed count.  N is the number of distinct ngrams found.
write_normalised_ngrams(Length, SOut, N) :-
    format('~N--- Finding N-grams of length ~d... ', [Length]),
    total_ngram_count(Length, user, NGrams, N, Total),
    format('found (~d ngrams, ~d different). Writing out ngrams~n', [Total, N]),
    sort_ngrams(NGrams, SortedNGrams),
    write_normalised_ngrams1(SortedNGrams, SOut, Total, 0).

% total_ngram_count(+Length, +Module, -List, -N, -Total)
%
% List holds the ngram(Words, Count) terms of that length in Module, N
% is its length and Total the sum of the counts.
total_ngram_count(Length, Module, List, N, Total) :-
    findall(NGram,
            ngram_of_length_n(NGram, Module, Length),
            List),
    length(List, N),
    total_ngram_count1(List, 0-Total).

% ngram_of_length_n(-NGram, +Module, +Length)
%
% Enumerates the Module:ngram/2 facts whose word list has Length words.
ngram_of_length_n(ngram(Words, Count), Module, 1) :-
    Words = [_],
    Module:ngram(Words, Count).
ngram_of_length_n(ngram(Words, Count), Module, 2) :-
    Words = [_, _],
    Module:ngram(Words, Count).
% Trigram case of ngram_of_length_n/3.
ngram_of_length_n(ngram(Words, Count), Module, 3) :-
    Words = [_, _, _],
    Module:ngram(Words, Count).

% total_ngram_count1(+NGrams, +SumIn - -SumOut)
%
% Adds up the counts of all ngram(Words, Count) terms.
total_ngram_count1([], Sum-Sum).
total_ngram_count1([NGram | NGrams], SumIn-SumOut) :-
    NGram = ngram(_Words, Count),
    SumNext is SumIn + Count,
    !,
    total_ngram_count1(NGrams, SumNext-SumOut).

% sort_ngrams(+List, -Sorted)
%
% Keys each ngram with its negated count, keysorts (so the highest
% counts come first) and strips the keys.
sort_ngrams(List, Sorted) :-
    tag_ngrams(List, Tagged),
    keysort(Tagged, SortedTagged),
    unkey_list(SortedTagged, Sorted).

tag_ngrams([], []).
tag_ngrams([NGram | NGrams], [Tagged | TaggedRest]) :-
    tag_ngram(NGram, Tagged),
    !,
    tag_ngrams(NGrams, TaggedRest).

tag_ngram(NGram, Key-NGram) :-
    NGram = ngram(_Words, Count),
    Key is Count * -1.

% write_normalised_ngrams1(+NGrams, +SOut, +Total, +Index)
%
% Writes every ngram normalised against Total; a progress number is
% printed after each 10000 ngrams.
write_normalised_ngrams1([], _SOut, _Total, _Index) :-
    !.
write_normalised_ngrams1([NGram | NGrams], SOut, Total, Index) :-
    Index1 is Index + 1,
    (   0 is Index1 mod 10000 ->
        format('~d ', [Index1]),
        flush_output(user)
    ;
        true
    ),
    write_normalised_ngram(NGram, SOut, Total),
    !,
    write_normalised_ngrams1(NGrams, SOut, Total, Index1).

% write_normalised_ngram(+NGram, +SOut, +Total)
%
% Writes ngram(FirstWord, Words, NormalisedCount), where the normalised
% count is the count per million of Total.  If the first clause does
% not apply, an error message is printed and the call fails.
write_normalised_ngram(ngram(Words, Count), SOut, Total) :-
    Words = [FirstWord | _Rest],
    PerMillion is (1000000 * Count) / Total,
    format(SOut, '~Nngram(~q, ~q, ~2f).~n', [FirstWord, Words, PerMillion]),
    !.
write_normalised_ngram(NGram, SOut, Total) :-
    format('~N*** Error: bad call: ~w~n', [write_normalised_ngram(NGram, SOut, Total)]),
    fail.