1% This file is part of the Attempto Parsing Engine (APE). 2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). 3% 4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it 5% under the terms of the GNU Lesser General Public License as published by the Free Software 6% Foundation, either version 3 of the License, or (at your option) any later version. 7% 8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT 9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 10% PURPOSE. See the GNU Lesser General Public License for more details. 11% 12% You should have received a copy of the GNU Lesser General Public License along with the Attempto 13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/. 14 15:- module(tokenizer, [ 16 tokenize/2 17 ]). 18:- use_module(library(error)). 19 20:- use_module('../lexicon/chars', [ 21 is_letter/1, 22 is_digit/1 23 ]). 24 25:- use_module('../logger/error_logger', [ 26 add_error_message_once/4, 27 add_warning_message_once/4 28 ]). 29 30:- use_module('../utils/ace_niceace', [ 31 pronoun_split/2 32 ]).
Tokenization will never fail. In case something goes wrong (e.g. a string or comment is not closed) then an error message is asserted.
84tokenize([], []) :- !. 85 86tokenize(Text, Tokens) :- 87 ( atom(Text) 88 -> atom_codes(Text, Codes) 89 ; string(Text) 90 -> string_codes(Text, Codes) 91 ), 92 !, 93 codes_to_tokens(Codes, Tokens). 94 95tokenize(Codes, Tokens) :- 96 must_be(codes, Codes), 97 codes_to_tokens(Codes, Tokens).
111codes_to_tokens(Codes, Tokens) :-
112 get_atomics(Codes, Atomics),
113 expand_contracted_forms(Atomics, Tokens).
121get_atomics([], []). 122 123% A paragraph break starts with \n followed by whitespace, 124% including at least one more \n. 125% If another character is encountered first then the paragraph is not broken. 126get_atomics([10 | Cs], AllTokens) :- 127 get_whitespace_and_comments(Cs, Remaining, Newline_Count), 128 !, 129 add_paragraph_break_symbol(Newline_Count, Ts, AllTokens), 130 get_atomics(Remaining, Ts). 131 132% Quoted string starts with " (34) 133get_atomics([34 | Cs], AllTokens) :- 134 !, 135 get_string(Cs, Prefix, Remaining), 136 string_to_token(Prefix, Ts, AllTokens), 137 get_atomics(Remaining, Ts). 138 139% Quoted word starts with ` (96) 140get_atomics([96 | Cs], [T | Ts]) :- 141 !, 142 get_qword(Cs, Prefix, Remaining), 143 atom_codes(T, Prefix), 144 get_atomics(Remaining, Ts). 145 146% Perl-style comment starts with # (35) 147get_atomics([35 | Cs], Ts) :- 148 !, 149 get_perl_comment(Cs, _Prefix, Remaining), 150 get_atomics(Remaining, Ts). 151 152% C-style comment starts with /* (47, 42) 153get_atomics([47, 42 | Cs], Ts) :- 154 !, 155 get_c_comment(Cs, _Prefix, Remaining), 156 get_atomics(Remaining, Ts). 157 158% Whitespace (\t, ' ', \r) is ignored (whitespace does not include \n) 159get_atomics([C | Cs], Ts) :- 160 is_whitespace(C), 161 !, 162 get_atomics(Cs, Ts). 163 164% Positive number starts with a digit 165get_atomics([C | Cs], [Number | Ts]) :- 166 is_digit(C), 167 !, 168 get_number(Cs, Prefix, Remaining), 169 number_codes(Number, [C | Prefix]), 170 get_atomics(Remaining, Ts). 171 172% Negative number starts with a hyphen (45) and then a digit, 173% e.g. numbers like `-.5' are not allowed. 174get_atomics([45, C | Cs], ['-', Number | Ts]) :- 175 is_digit(C), 176 !, 177 get_number(Cs, Prefix, Remaining), 178 number_codes(Number, [C | Prefix]), 179 get_atomics(Remaining, Ts). 180 181% Word starts (see is_word_char/1) 182get_atomics([C | Cs], AllTokens) :- 183 is_word_char(C), 184 !, 185 get_word(Cs, Prefix, Remaining), 186 split_token([C | Prefix], Ts, AllTokens), 187 get_atomics(Remaining, Ts). 188 189% Special character maps to a one-character token 190get_atomics([C | Cs], [T | Ts]) :- 191 is_special(C, T), 192 !, 193 get_atomics(Cs, Ts). 194 195% All other characters (e.g. Japanese) are ignored, with a warning message. 196get_atomics([C | Cs], Ts) :- 197 with_output_to(atom(CharCode), format("~c (0x~16r, ~10r)", [C, C, C])), 198 add_warning_message_once(character, '', CharCode, 'Unknown character(s) ignored.'), 199 get_atomics(Cs, Ts).
207split_token(Codes, Ts, FinalTokens) :-
208 atom_codes(A, Codes),
209 (
210 pronoun_split(A, (A1, A2))
211 ->
212 FinalTokens = [A1, A2 | Ts]
213 ;
214 FinalTokens = [A | Ts]
215 ).
222is_whitespace(32). % ' ' 223is_whitespace(9). % \t 224is_whitespace(13). % \r
232is_word_char(45). 233 234% _ (underscore) 235is_word_char(95). 236 237% $ (dollar) 238is_word_char(36). 239 240% degree sign 241is_word_char(176). 242 243% letters 244is_word_char(Code) :- 245 is_letter(Code).
255% period, question mark, exclamation mark 256is_special(46, '.'). 257is_special(63, '?'). 258is_special(33, '!'). 259 260% hyphen 261is_special(45, '-'). 262 263% colon (for prefixed words' support) 264is_special(58, ':'). 265 266% apostroph (for Saxon genitive support) 267is_special(39, '\''). 268 269% slash (for him/her support) 270is_special(47, '/'). 271 272% comma (for comma-and, comma-or support) 273is_special(44, ','). 274 275% plus sign 276is_special(43, '+'). 277 278% exponentiation sign 279is_special(94, '^'). 280 281% star 282is_special(42, '*'). 283 284% parentheses () 285is_special(40, '('). 286is_special(41, ')'). 287 288% square brackets [] 289is_special(91, '['). 290is_special(93, ']'). 291 292% curly bracktes {} 293is_special(123, '{'). 294is_special(125, '}'). 295 296% < = > \ 297is_special(60, '<'). 298is_special(61, '='). 299is_special(62, '>'). 300is_special(92, '\\'). 301 302% ampersand 303is_special(38, '&').
313get_string([], [], []) :- 314 add_error_message_once(character, '', 'EOF', 'Every quoted string must end with ".'). 315 316get_string([34 | Cs], [], Cs) :- !. 317 318get_string([92, C | Cs], [C | Prefix], Remaining) :- 319 !, 320 get_string(Cs, Prefix, Remaining). 321 322get_string([C | Cs], [C | Prefix], Remaining) :- 323 get_string(Cs, Prefix, Remaining).
333get_qword([], [], []) :- 334 add_error_message_once(character, '', 'EOF', 'Every quoted word must end with `.'). 335 336get_qword([96 | Cs], [], Cs) :- !. 337 338get_qword([92, C | Cs], [C | Prefix], Remaining) :- 339 !, 340 get_qword(Cs, Prefix, Remaining). 341 342get_qword([C | Cs], [C | Prefix], Remaining) :- 343 get_qword(Cs, Prefix, Remaining).
351get_word([], [], []). 352 353get_word([C | Cs], [C | Prefix], Remaining) :- 354 is_word_char(C), 355 !, 356 get_word(Cs, Prefix, Remaining). 357 358get_word([C | Cs], [C | Prefix], Remaining) :- 359 is_digit(C), 360 !, 361 get_word(Cs, Prefix, Remaining). 362 363get_word([C | Cs], [], [C | Cs]).
372get_number(Codes, Prefix, RemainingCodes) :- 373 get_number_x(Codes, zero, Prefix, RemainingCodes). 374 375 376get_number_x([], _, [], []). 377 378get_number_x([46, C | Cs], zero, [46, C | Prefix], Remaining) :- 379 is_digit(C), 380 !, 381 get_number_x(Cs, more_than_zero, Prefix, Remaining). 382 383get_number_x([C | Cs], Dot_Count, [C | Prefix], Remaining) :- 384 is_digit(C), 385 !, 386 get_number_x(Cs, Dot_Count, Prefix, Remaining). 387 388get_number_x([C | Cs], _, [], [C | Cs]).
395get_perl_comment([], [], []) :- 396 add_error_message_once(character, '', 'EOF', 'Every #-comment must end with the newline.'). 397 398get_perl_comment([10 | Cs], [], [10 | Cs]) :- 399 !. 400 401get_perl_comment([C | Cs], [C | Prefix], Remaining) :- 402 get_perl_comment(Cs, Prefix, Remaining).
409get_c_comment([], [], []) :- 410 add_error_message_once(character, '', 'EOF', 'Every /*-comment must end with */.'). 411 412get_c_comment([42, 47 | Cs], [], Cs) :- 413 !. 414 415get_c_comment([C | Cs], [C | Prefix], Remaining) :- 416 get_c_comment(Cs, Prefix, Remaining).
BUG: I'd rather prefer a representation such as
string('')
or string(content)
.
427string_to_token(Prefix, Ts, [T | Ts]) :-
428 atom_codes(S, Prefix),
429 concat_atom(['"', S, '"'], T).
434add_paragraph_break_symbol(at_least_two, Ts, ['<p>' | Ts]) :- !. 435add_paragraph_break_symbol(_, Ts, Ts).
442get_whitespace_and_comments(Cs, Remaining, Newline_Count) :- 443 get_whitespace_and_comments(Cs, one, Remaining, Newline_Count). 444 445get_whitespace_and_comments([], Newline_Count, [], Newline_Count). 446 447get_whitespace_and_comments([10 | Cs], _, Remaining, Newline_Count) :- 448 !, 449 get_whitespace_and_comments(Cs, at_least_two, Remaining, Newline_Count). 450 451get_whitespace_and_comments([C | Cs], Newline_Count, Remaining, Final_Newline_Count) :- 452 is_whitespace(C), 453 !, 454 get_whitespace_and_comments(Cs, Newline_Count, Remaining, Final_Newline_Count). 455 456/* 457% Perl-style comment starts 458get_whitespace_and_comments([35 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :- 459 !, 460 get_perl_comment(Cs, _Prefix, Remaining), 461 get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count). 462 463% C-style comment starts 464get_whitespace_and_comments([47, 42 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :- 465 !, 466 get_c_comment(Cs, _Prefix, Remaining), 467 get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count). 468*/ 469 470get_whitespace_and_comments([C | Cs], Newline_Count, [C | Cs], Newline_Count).
477expand_contracted_forms([], []). 478 479expand_contracted_forms(['No', one | RestIn], ['No', '-one' | RestOut]) :- 480 !, 481 expand_contracted_forms(RestIn, RestOut). 482 483expand_contracted_forms([no, one | RestIn], [no, '-one' | RestOut]) :- 484 !, 485 expand_contracted_forms(RestIn, RestOut). 486 487expand_contracted_forms([isn, '\'', t | RestIn], [is, not | RestOut]) :- 488 !, 489 expand_contracted_forms(RestIn, RestOut). 490 491expand_contracted_forms([aren, '\'', t | RestIn], [are, not | RestOut]) :- 492 !, 493 expand_contracted_forms(RestIn, RestOut). 494 495expand_contracted_forms([doesn, '\'', t | RestIn], [does, not | RestOut]) :- 496 !, 497 expand_contracted_forms(RestIn, RestOut). 498 499expand_contracted_forms([don, '\'', t | RestIn], [do, not | RestOut]) :- 500 !, 501 expand_contracted_forms(RestIn, RestOut). 502 503expand_contracted_forms([can, '\'', t | RestIn], [can, not | RestOut]) :- 504 !, 505 expand_contracted_forms(RestIn, RestOut). 506 507expand_contracted_forms([cannot | RestIn], [can, not | RestOut]) :- 508 !, 509 expand_contracted_forms(RestIn, RestOut). 510 511expand_contracted_forms([shouldn, '\'', t | RestIn], [should, not | RestOut]) :- 512 !, 513 expand_contracted_forms(RestIn, RestOut). 514 515expand_contracted_forms([Token | TailIn], [Token | TailOut]) :- 516 expand_contracted_forms(TailIn, TailOut)
APE Tokenizer
Comments:
string(content)
, this would probably also fix the buggy handling of `Every dot is ".".'.BUG: future work: add character counting to be able to report character offsets together with every token.
Example:
?- tokenizer:
codes_to_tokens("2men;can't like\"#@\"and:everything@.", T)
,writeq(T)
. [2, men, can, not, like, '"#@"', and, :, every, '-thing', '.']*/