1% This file is part of the Attempto Parsing Engine (APE).
    2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
    3%
    4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
    5% under the terms of the GNU Lesser General Public License as published by the Free Software
    6% Foundation, either version 3 of the License, or (at your option) any later version.
    7%
    8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
    9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
   10% PURPOSE. See the GNU Lesser General Public License for more details.
   11%
   12% You should have received a copy of the GNU Lesser General Public License along with the Attempto
   13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.
   14
   15:- module(tokenizer, [
   16		tokenize/2
   17	]).   18:- use_module(library(error)).   19
   20:- use_module('../lexicon/chars', [
   21		is_letter/1,
   22		is_digit/1
   23	]).   24
   25:- use_module('../logger/error_logger', [
   26		add_error_message_once/4,
   27		add_warning_message_once/4
   28	]).   29
   30:- use_module('../utils/ace_niceace', [
   31        pronoun_split/2
   32    ]).

APE Tokenizer

author
- Kaarel Kaljurand
- Tobias Kuhn
version
- 2010-03-28

Comments:

  • Strings (between double quotes) are tokenized into [", content, "]. This can be misleading, e.g. in the sentence: `"1" represents 1.' the verb `represents' is reported as the 4th token. One could instead produce the term string(content), this would probably also fix the buggy handling of `Every dot is ".".'.
  • Saxon Genitives are tokenized as [noun, ', s] and [nouns, ']
  • Digits cannot start a token that contains other symbols than digits and dots. Consider the following input string and its tokenization.
    123man man123 123man123 man123man
    
    [123, man, man123, 123, man123, man123man]

    BUG: future work: add character counting to be able to report character offsets together with every token.

    Example:

    ?- tokenizer:codes_to_tokens("2men;can't like\"#@\"and:everything@.", T), writeq(T). [2, men, can, not, like, '"#@"', and, :, every, '-thing', '.']

    */

 tokenize(+ACEText:term, -Tokens:list) is det
Breaks the ACEText (either an atom, or a list of character codes) into a list of tokens (atoms or numbers). The input ACEText can either be an atom like 'this is an example' or a list of character codes like [116,104,105,115,32,105,...] (possibly written as "this is an example").

Tokenization will never fail. In case something goes wrong (e.g. a string or comment is not closed) then an error message is asserted.

Arguments:
ACEText- is the input text, it is either an atom or a string
Tokens- is a list of tokens, i.e. the tokenization of the input text
   84tokenize([], []) :- !.
   85
   86tokenize(Text, Tokens) :-
   87	(   atom(Text)
   88	->  atom_codes(Text, Codes)
   89	;   string(Text)
   90	->  string_codes(Text, Codes)
   91	),
   92	!,
   93	codes_to_tokens(Codes, Tokens).
   94
   95tokenize(Codes, Tokens) :-
   96	must_be(codes, Codes),
   97	codes_to_tokens(Codes, Tokens).
 codes_to_tokens(+Codes:list, -Tokens:list) is det
Maps the given list of character codes into a list of tokens. Performs two steps:
  1. Merges certain code sequences into single tokens (atoms or numbers).
  2. Modifies certain token sequences, e.g. [can, ', t] -> [can, not]. @param Codes is a list of character codes @param Tokens is a list of tokens, i.e. the tokenization of the input text
  111codes_to_tokens(Codes, Tokens) :-
  112	get_atomics(Codes, Atomics),
  113	expand_contracted_forms(Atomics, Tokens).
 get_atomics(+Codes:list, -Tokens:list) is det
BUG: We could also preserve the comment in the token list to be able to show better error messages.
  121get_atomics([], []).
  122
  123% A paragraph break starts with \n followed by whitespace,
  124% including at least one more \n.
  125% If another character is encountered first then the paragraph is not broken.
  126get_atomics([10 | Cs], AllTokens) :-
  127	get_whitespace_and_comments(Cs, Remaining, Newline_Count),
  128	!,
  129	add_paragraph_break_symbol(Newline_Count, Ts, AllTokens),
  130	get_atomics(Remaining, Ts).
  131
  132% Quoted string starts with " (34)
  133get_atomics([34 | Cs], AllTokens) :-
  134	!,
  135	get_string(Cs, Prefix, Remaining),
  136	string_to_token(Prefix, Ts, AllTokens),
  137	get_atomics(Remaining, Ts).
  138
  139% Quoted word starts with ` (96)
  140get_atomics([96 | Cs], [T | Ts]) :-
  141	!,
  142	get_qword(Cs, Prefix, Remaining),
  143	atom_codes(T, Prefix),
  144	get_atomics(Remaining, Ts).
  145
  146% Perl-style comment starts with # (35)
  147get_atomics([35 | Cs], Ts) :-
  148	!,
  149	get_perl_comment(Cs, _Prefix, Remaining),
  150	get_atomics(Remaining, Ts).
  151
  152% C-style comment starts with /* (47, 42)
  153get_atomics([47, 42 | Cs], Ts) :-
  154	!,
  155	get_c_comment(Cs, _Prefix, Remaining),
  156	get_atomics(Remaining, Ts).
  157
  158% Whitespace (\t, ' ', \r) is ignored (whitespace does not include \n)
  159get_atomics([C | Cs], Ts) :-
  160	is_whitespace(C),
  161	!,
  162	get_atomics(Cs, Ts).
  163
  164% Positive number starts with a digit
  165get_atomics([C | Cs], [Number | Ts]) :-
  166	is_digit(C),
  167	!,
  168	get_number(Cs, Prefix, Remaining),
  169	number_codes(Number, [C | Prefix]),
  170	get_atomics(Remaining, Ts).
  171
  172% Negative number starts with a hyphen (45) and then a digit,
  173% e.g. numbers like `-.5' are not allowed.
  174get_atomics([45, C | Cs], ['-', Number | Ts]) :-
  175	is_digit(C),
  176	!,
  177	get_number(Cs, Prefix, Remaining),
  178	number_codes(Number, [C | Prefix]),
  179	get_atomics(Remaining, Ts).
  180
  181% Word starts (see is_word_char/1)
  182get_atomics([C | Cs], AllTokens) :-
  183	is_word_char(C),
  184	!,
  185	get_word(Cs, Prefix, Remaining),
  186	split_token([C | Prefix], Ts, AllTokens),
  187	get_atomics(Remaining, Ts).
  188
  189% Special character maps to a one-character token
  190get_atomics([C | Cs], [T | Ts]) :-
  191	is_special(C, T),
  192	!,
  193	get_atomics(Cs, Ts).
  194
  195% All other characters (e.g. Japanese) are ignored, with a warning message.
  196get_atomics([C | Cs], Ts) :-
  197	with_output_to(atom(CharCode), format("~c (0x~16r, ~10r)", [C, C, C])),
  198	add_warning_message_once(character, '', CharCode, 'Unknown character(s) ignored.'),
  199	get_atomics(Cs, Ts).
 split_token(+Codes:list, +Ts:list, -FinalTokens:list) is det
Builds a token (atom) and splits it into two if needed, e.g. 'Everything' is split into 'Every' and '-thing'.
  207split_token(Codes, Ts, FinalTokens) :-
  208	atom_codes(A, Codes),
  209	(
  210		pronoun_split(A, (A1, A2))
  211	->
  212		FinalTokens = [A1, A2 | Ts]
  213	;
  214		FinalTokens = [A | Ts]
  215	).
 is_whitespace(?Code)
Note that newlines are handled elsewhere.
  222is_whitespace(32). % ' '
  223is_whitespace(9).  % \t
  224is_whitespace(13). % \r
 is_word_char(?Code)
Characters that are allowed in ACE words.
  232is_word_char(45).
  233
  234% _ (underscore)
  235is_word_char(95).
  236
  237% $ (dollar)
  238is_word_char(36).
  239
  240% degree sign
  241is_word_char(176).
  242
  243% letters
  244is_word_char(Code) :-
  245	is_letter(Code).
 is_special(?Code, ?Atom) is det
Characters that translate into one-character atoms.
Arguments:
Code- is character code
Atom- is a one-character atom that corresponds to the code
  255% period, question mark, exclamation mark
  256is_special(46, '.').
  257is_special(63, '?').
  258is_special(33, '!').
  259
  260% hyphen
  261is_special(45, '-').
  262
  263% colon (for prefixed words' support)
  264is_special(58, ':').
  265
  266% apostroph (for Saxon genitive support)
  267is_special(39, '\'').
  268
  269% slash (for him/her support)
  270is_special(47, '/').
  271
  272% comma (for comma-and, comma-or support)
  273is_special(44, ',').
  274
  275% plus sign
  276is_special(43, '+').
  277
  278% exponentiation sign
  279is_special(94, '^').
  280
  281% star
  282is_special(42, '*').
  283
  284% parentheses ()
  285is_special(40, '(').
  286is_special(41, ')').
  287
  288% square brackets []
  289is_special(91, '[').
  290is_special(93, ']').
  291
  292% curly bracktes {}
  293is_special(123, '{').
  294is_special(125, '}').
  295
  296% < = > \
  297is_special(60, '<').
  298is_special(61, '=').
  299is_special(62, '>').
  300is_special(92, '\\').
  301
  302% ampersand
  303is_special(38, '&').
 get_string(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes the sequence of characters within a quoted string, as well as the closing quotation mark. The backslash (\, 92) can be used to escape the following character, e.g. the quotation mark or a backslash.
  313get_string([], [], []) :-
  314	add_error_message_once(character, '', 'EOF', 'Every quoted string must end with ".').
  315
  316get_string([34 | Cs], [], Cs) :- !.
  317
  318get_string([92, C | Cs], [C | Prefix], Remaining) :-
  319	!,
  320	get_string(Cs, Prefix, Remaining).
  321
  322get_string([C | Cs], [C | Prefix], Remaining) :-
  323	get_string(Cs, Prefix, Remaining).
 get_qword(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes the sequence of characters within a quoted word, as well as the closing backtick. The backslash (\, 92) can be used to escape the following character, e.g. the quotation mark or a backslash.
  333get_qword([], [], []) :-
  334	add_error_message_once(character, '', 'EOF', 'Every quoted word must end with `.').
  335
  336get_qword([96 | Cs], [], Cs) :- !.
  337
  338get_qword([92, C | Cs], [C | Prefix], Remaining) :-
  339	!,
  340	get_qword(Cs, Prefix, Remaining).
  341
  342get_qword([C | Cs], [C | Prefix], Remaining) :-
  343	get_qword(Cs, Prefix, Remaining).
 get_word(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes the word consisting of word characters and/or digits.
  351get_word([], [], []).
  352
  353get_word([C | Cs], [C | Prefix], Remaining) :-
  354	is_word_char(C),
  355	!,
  356	get_word(Cs, Prefix, Remaining).
  357
  358get_word([C | Cs], [C | Prefix], Remaining) :-
  359	is_digit(C),
  360	!,
  361	get_word(Cs, Prefix, Remaining).
  362
  363get_word([C | Cs], [], [C | Cs]).
 get_number(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes a number which is a sequence of digits containing at most one dot (46). The dot (if present) must be followed by a digit.
  372get_number(Codes, Prefix, RemainingCodes) :-
  373	get_number_x(Codes, zero, Prefix, RemainingCodes).
  374
  375
  376get_number_x([], _, [], []).
  377
  378get_number_x([46, C | Cs], zero, [46, C | Prefix], Remaining) :-
  379	is_digit(C),
  380	!,
  381	get_number_x(Cs, more_than_zero, Prefix, Remaining).
  382
  383get_number_x([C | Cs], Dot_Count, [C | Prefix], Remaining) :-
  384	is_digit(C),
  385	!,
  386	get_number_x(Cs, Dot_Count, Prefix, Remaining).
  387
  388get_number_x([C | Cs], _, [], [C | Cs]).
 get_perl_comment(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes the Perl-style comment excluding the final newline.
  395get_perl_comment([], [], []) :-
  396	add_error_message_once(character, '', 'EOF', 'Every #-comment must end with the newline.').
  397
  398get_perl_comment([10 | Cs], [], [10 | Cs]) :-
  399	!.
  400
  401get_perl_comment([C | Cs], [C | Prefix], Remaining) :-
  402	get_perl_comment(Cs, Prefix, Remaining).
 get_c_comment(+Codes:list, -Prefix:list, -RemainingCodes:list) is det
Consumes the C-style comment including the final */.
  409get_c_comment([], [], []) :-
  410	add_error_message_once(character, '', 'EOF', 'Every /*-comment must end with */.').
  411
  412get_c_comment([42, 47 | Cs], [], Cs) :-
  413	!.
  414
  415get_c_comment([C | Cs], [C | Prefix], Remaining) :-
  416	get_c_comment(Cs, Prefix, Remaining).
 string_to_token
The content of the string is bordered by quotation marks in the token list.

BUG: I'd rather prefer a representation such as string('') or string(content).

  427string_to_token(Prefix, Ts, [T | Ts]) :-
  428	atom_codes(S, Prefix),
  429	concat_atom(['"', S, '"'], T).
 add_paragraph_break_symbol
  434add_paragraph_break_symbol(at_least_two, Ts, ['<p>' | Ts]) :- !.
  435add_paragraph_break_symbol(_, Ts, Ts).
 get_whitespace_and_comments
Consuming whitespace and counting the newlines.
  442get_whitespace_and_comments(Cs, Remaining, Newline_Count) :-
  443	get_whitespace_and_comments(Cs, one, Remaining, Newline_Count).
  444
  445get_whitespace_and_comments([], Newline_Count, [], Newline_Count).
  446
  447get_whitespace_and_comments([10 | Cs], _, Remaining, Newline_Count) :-
  448	!,
  449	get_whitespace_and_comments(Cs, at_least_two, Remaining, Newline_Count).
  450
  451get_whitespace_and_comments([C | Cs], Newline_Count, Remaining, Final_Newline_Count) :-
  452	is_whitespace(C),
  453	!,
  454	get_whitespace_and_comments(Cs, Newline_Count, Remaining, Final_Newline_Count).
  455
  456/*
  457% Perl-style comment starts
  458get_whitespace_and_comments([35 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :-
  459	!,
  460	get_perl_comment(Cs, _Prefix, Remaining),
  461	get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count).
  462
  463% C-style comment starts
  464get_whitespace_and_comments([47, 42 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :-
  465	!,
  466	get_c_comment(Cs, _Prefix, Remaining),
  467	get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count).
  468*/
  469
  470get_whitespace_and_comments([C | Cs], Newline_Count, [C | Cs], Newline_Count).
 expand_contracted_forms(+TokenListIn:list, -TokenListOut:list) is det
bug
- : `cannot' could be instead handled during (pronoun) splitting
  477expand_contracted_forms([], []).
  478
  479expand_contracted_forms(['No', one | RestIn], ['No', '-one' | RestOut]) :-
  480	!,
  481	expand_contracted_forms(RestIn, RestOut).
  482
  483expand_contracted_forms([no, one | RestIn], [no, '-one' | RestOut]) :-
  484	!,
  485	expand_contracted_forms(RestIn, RestOut).
  486
  487expand_contracted_forms([isn, '\'', t | RestIn], [is, not | RestOut]) :-
  488	!,
  489	expand_contracted_forms(RestIn, RestOut).
  490
  491expand_contracted_forms([aren, '\'', t | RestIn], [are, not | RestOut]) :-
  492	!,
  493	expand_contracted_forms(RestIn, RestOut).
  494
  495expand_contracted_forms([doesn, '\'', t | RestIn], [does, not | RestOut]) :-
  496	!,
  497	expand_contracted_forms(RestIn, RestOut).
  498
  499expand_contracted_forms([don, '\'', t | RestIn], [do, not | RestOut]) :-
  500	!,
  501	expand_contracted_forms(RestIn, RestOut).
  502
  503expand_contracted_forms([can, '\'', t | RestIn], [can, not | RestOut]) :-
  504	!,
  505	expand_contracted_forms(RestIn, RestOut).
  506
  507expand_contracted_forms([cannot | RestIn], [can, not | RestOut]) :-
  508	!,
  509	expand_contracted_forms(RestIn, RestOut).
  510
  511expand_contracted_forms([shouldn, '\'', t | RestIn], [should, not | RestOut]) :-
  512	!,
  513	expand_contracted_forms(RestIn, RestOut).
  514
  515expand_contracted_forms([Token | TailIn], [Token | TailOut]) :-
  516	expand_contracted_forms(TailIn, TailOut)