:- ensure_loaded('$REGULUS/PrologLib/compatibility').

%---------------------------------------------------------------

:- module(tokenize_sents,
          [tokenize_sent/2,
           tokenize_sents_in_file/2,
           tokenized_sent_to_atom/2,
           load_multiwords/1,
           remove_start_and_end_markers/2,
           test_tokenize_sents/1
          ]
         ).

%---------------------------------------------------------------

:- use_module('$REGULUS/PrologLib/utilities').

:- use_module(library(lists)).

%---------------------------------------------------------------

test_tokenize_sents(0) :-
    Sent = 'Je suis désolée, Monsieur Hänsch et Monsieur Cox, je n\'avais pas vu que vous demandiez la parole.',
    tokenize_sent(Sent, TokenizedSent),
    format('~NIn  : ~w~n', [Sent]),
    format('~NOut : ~w~n', [TokenizedSent]),
    tokenized_sent_to_atom(TokenizedSent, Sent1),
    format('~NBack: ~w~n', [Sent1]),
    !.
test_tokenize_sents(small) :-
    tokenize_sents_in_file('$ACCEPT/MT/Europarl/Generated/europarl_ez_filtered_small.txt',
                           '$ACCEPT/MT/Europarl/Generated/europarl_ez_tokenized_small.pl').
test_tokenize_sents(full) :-
    tokenize_sents_in_file('$ACCEPT/MT/Europarl/Generated/europarl_ez_filtered.txt',
                           '$ACCEPT/MT/Europarl/Generated/europarl_ez_tokenized.txt').
test_tokenize_sents(forum) :-
    tokenize_sents_in_file('$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/all_files_v2.txt',
                           '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_tokenised.pl').
test_tokenize_sents(tm_fr) :-
    tokenize_sents_in_file('$ACCEPT/MT/GTFeb2012/TM/symc_bip_07_en_fr.fr',
                           '$ACCEPT/MT/GTFeb2012/TM/tm_fr_tokenised.pl').
test_tokenize_sents(all_europarl_fr) :-
    tokenize_sents_in_file('$ACCEPT/MT/Europarl/Data/europarl-v6.fr-en.fr',
                           '$ACCEPT/MT/Europarl/Generated/europarl-v6-fr-tokenized.pl').
test_tokenize_sents(translated_fr_forum) :-
    tokenize_sents_in_file('$ACCEPT/MT/PostEdition/Data/SymantecData/train.fr.translate',
                           '$ACCEPT/MT/PostEdition/Data/SymantecData/translated_fr_tokenized.pl').

%---------------------------------------------------------------

load_multiwords(File) :-
    safe_absolute_file_name(File, AbsFile),
    safe_compile(multiwords, AbsFile),
    format('~N--- Loaded multiwords file ~w~n', [AbsFile]),
    !.

multiwords_are_defined :-
    current_predicate(multiwords:multiword/2).

%---------------------------------------------------------------

tokenize_sents_in_file(InFile, OutFile) :-
    absolute_file_name(InFile, AbsInFile),
    absolute_file_name(OutFile, AbsOutFile),
    open(AbsInFile, read, SIn, [encoding('UTF-8')]),
    open(AbsOutFile, write, SOut, [encoding('UTF-8')]),
    tokenize_sents_in_stream(SIn, SOut, 0-NIn, 0-NOut),
    close(SIn),
    close(SOut),
    %list_to_prolog_file_with_encoding(OutList, OutFile, 'UTF-8'),
    format('~N--- Read file (~d lines) ~w~n', [NIn, AbsInFile]),
    format('~N--- Written file (~d lines) ~w~n', [NOut, AbsOutFile]),
    !.

tokenize_sents_in_stream(SIn, SOut, InI-InO, OutI-OutO) :-
    read_line(SIn, Line),
    !,
    tokenize_sents_in_stream1(Line, SIn, SOut, InI-InO, OutI-OutO).

tokenize_sents_in_stream1(Line, _SIn, _SOut, InI-InI, OutI-OutI) :-
    Line = end_of_file,
    !.
tokenize_sents_in_stream1(Line, SIn, SOut, InI-InO, OutI-OutO) :-
    tokenize_and_write_line(Line, SOut, InI-InNext, OutI-OutNext),
    !,
    tokenize_sents_in_stream(SIn, SOut, InNext-InO, OutNext-OutO).

tokenize_and_write_line(Line, SOut, I-I1, O-O1) :-
    tokenize_sent(Line, Tokenized),
    (   member('BAD'(Rest), Tokenized) ->
        format('~N*** Warning: unable to tokenize "~w"~n', [Rest]),
        O1 is O
    ;   otherwise ->
        format(SOut, '~N~q.~n', [Tokenized]),
        O1 is O + 1
    ),
    I1 is I + 1,
    (   0 is I1 mod 1000 ->
        format('~d ', [I1]),
        flush_output(user)
    ;   otherwise ->
        true
    ),
    !.
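
% Illustrative query for tokenize_sent/2 (defined below). This is a
% sketch of the expected token list, assuming no multiwords file has
% been loaded, so no tokens are collapsed:
%
%   | ?- tokenize_sent('Je n\'avais pas vu.', T).
%   T = ['*start*','Je',' ','n\'','avais',' ','pas',' ','vu','.','*end*']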
%---------------------------------------------------------------

tokenize_sent(Input, TokenizedSent) :-
    % Accept either an atom (as passed by test_tokenize_sents(0)) or a
    % code list (as produced by read_line/2 in the stream loop above).
    (   atom(Input) ->
        atom_codes(Input, Str)
    ;   otherwise ->
        Str = Input
    ),
    tokenized_sent(TokenizedSent0, Str, []),
    (   multiwords_are_defined ->
        collapse_multiwords(TokenizedSent0, TokenizedSent1)
    ;   otherwise ->
        TokenizedSent0 = TokenizedSent1
    ),
    add_start_and_end_markers(TokenizedSent1, TokenizedSent),
    !.

tokenized_sent_to_atom(TokenizedSent0, Atom) :-
    remove_start_and_end_markers(TokenizedSent0, TokenizedSent),
    tokenized_sent_to_string(TokenizedSent, Str-[]),
    atom_codes(Atom, Str),
    !.
tokenized_sent_to_atom(TokenizedSent, _Atom) :-
    format('~N*** Error: unable to convert "~w" to string~n', [TokenizedSent]),
    fail.

tokenized_sent_to_string([], In-In).
tokenized_sent_to_string([F | R], In-Out) :-
    tokenized_sent_element_to_string(F, F1),
    append(F1, Next, In),
    !,
    tokenized_sent_to_string(R, Next-Out).

tokenized_sent_element_to_string(l(Atom0), Str) :-
    lowercase_atom(Atom0, Atom),
    atom_codes(Atom, Str),
    !.
tokenized_sent_element_to_string(Atom, Str) :-
    atom_codes(Atom, Str),
    !.

%---------------------------------------------------------------

collapse_multiwords([], []).
collapse_multiwords(In, Out) :-
    multiwords:multiword(In-InNext, Out-OutNext),
    !,
    collapse_multiwords(InNext, OutNext).
collapse_multiwords([F | R], [F | R1]) :-
    collapse_multiwords(R, R1).

%---------------------------------------------------------------

add_start_and_end_markers(TokenizedSent0, TokenizedSent) :-
    append(['*start*' | TokenizedSent0], ['*end*'], TokenizedSent),
    !.

remove_start_and_end_markers([], []).
remove_start_and_end_markers(['*start*', ' ' | R], R1) :-
    !,
    remove_start_and_end_markers(R, R1).
remove_start_and_end_markers([F | R], R1) :-
    member(F, ['*start*', '*end*']),
    !,
    remove_start_and_end_markers(R, R1).
remove_start_and_end_markers([F | R], [F | R1]) :-
    !,
    remove_start_and_end_markers(R, R1).

%---------------------------------------------------------------

tokenized_sent([]) --> [].
tokenized_sent([F | R]) -->
    tokenized_sent_element(F),
    !,
    tokenized_sent(R).

tokenized_sent_element(Atom) -->
    special_word(Atom),
    !.
tokenized_sent_element(Atom) -->
    alphanum_string(Atom),
    !.
tokenized_sent_element(Atom) -->
    punctuation_mark(Atom),
    !.
tokenized_sent_element(' ') -->
    whitespace,
    !.
% In case something goes wrong.
tokenized_sent_element('BAD'(RestAtom), Rest, []) :-
    atom_codes(RestAtom, Rest),
    !.

alphanum_string(Atom) -->
    alphanum_char_sequence(Str),
    {Str \== []},
    {atom_codes(Atom, Str)},
    !.

punctuation_mark(Atom) -->
    [Char],
    {punctuation_char(Char)},
    {atom_codes(Atom, [Char])}.

alphanum_char_sequence([F | R]) -->
    alphanum_char(F),
    !,
    alphanum_char_sequence(R).
alphanum_char_sequence([]) --> [].

alphanum_char(Char) -->
    [Char],
    {alphanum_char(Char)}.

digit_char(D) -->
    [D],
    {digit_char(D)}.

whitespace -->
    [Char],
    {whitespace_char(Char)},
    !,
    possible_empty_whitespace.

possible_empty_whitespace -->
    whitespace,
    !.
possible_empty_whitespace --> [].

alphanum_char(Char) :-
    uppercase_char(Char),
    !.
alphanum_char(Char) :-
    lowercase_char(Char),
    !.
alphanum_char(Char) :-
    digit_char(Char),
    !.

punctuation_char(X) :-
    \+ alphanum_char(X),
    \+ whitespace_char(X).

special_word('m\'') --> "m'".
special_word('t\'') --> "t'".
special_word('s\'') --> "s'".
special_word('l\'') --> "l'".
special_word('qu\'') --> "qu'".
special_word('j\'') --> "j'".
special_word('c\'') --> "c'".
special_word('n\'') --> "n'".
special_word('M\'') --> "M'".
special_word('T\'') --> "T'".
special_word('S\'') --> "S'".
special_word('L\'') --> "L'".
special_word('Qu\'') --> "Qu'".
special_word('J\'') --> "J'".
special_word('C\'') --> "C'".
special_word('N\'') --> "N'".
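
% Only the elided forms above are recognised as special words. Further
% French elisions (d', D', jusqu', lorsqu', puisqu', ...) could be added
% in the same style if a corpus needs them; the clauses below are an
% illustrative sketch, not part of the original inventory:
%
%   special_word('d\'') --> "d'".
%   special_word('jusqu\'') --> "jusqu'".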
/*
punctuation_char(0'.).
punctuation_char(0',).
punctuation_char(0':).
punctuation_char(0';).
punctuation_char(0',).
punctuation_char(0'-).
punctuation_char(0'$).
punctuation_char(0'!).
punctuation_char(0'().
punctuation_char(0')).
punctuation_char(0'?).
punctuation_char(0'\').
punctuation_char(0'"). %"
punctuation_char(0'%).
punctuation_char(0'&).
*/
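
%---------------------------------------------------------------

% Sketch of the clause format expected in a multiwords file, as consumed
% by load_multiwords/1 and collapse_multiwords/2 above: each multiword/2
% clause pairs a difference list over the input tokens (including the
% ' ' tokens) with a difference list over the output tokens. The clause
% below is hypothetical; real multiword inventories are project-specific
% and not part of this module:
%
%   multiword(['parce', ' ', 'que' | In]-In, ['parce que' | Out]-Out).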