:- module(tokenize,
          [ tokenize/2,
            tokenize/3,
            tokenize_file/2,
            tokenize_file/3,
            untokenize/2
          ]).

/** <module> tokenize

This module offers a simple tokenizer with flexible options.

@author Shon Feder
@license http://unlicense.org/

Rationale:

tokenize_atom/2, in library(porter_stem), is inflexible, in that it doesn't
allow for the preservation of white space or control characters, and it only
tokenizes into a list of atoms.

The tokenize library is meant to be easy to use while allowing for relatively
flexible input and output. Features include

  * options for tokenization of spaces, numbers, strings, control characters
    and punctuation
  * options to output packed tokens
  * options to represent tokens in any of the common SWI-Prolog text formats
  * option to preserve or ignore case
  * a predicate to emit text given a list of tokens

E.g.,

==
?- tokenize('Tokenizes: words,"strings", 1234.5\n', Tokens, [cased(true), spaces(false)]),
|    untokenize(Tokens, Codes).
Tokens = [word('Tokenizes'), punct(:), word(words), punct(','), string(strings), punct(','), number(1234.5), cntrl('\n')],
Codes = "Tokenizes:words,"strings"...34.5
".
==

tokenize is much more limited and much less performant than a lexer
generator, but it is dead simple to use and flexible enough for many common
use cases.
*/

:- use_module(library(dcg/basics), [eos//0, number//1]).
:- use_module(tokenize_opts).

% Ensure we interpret back ticks as enclosing code lists in this module.
:- set_prolog_flag(back_quotes, codes).

%! tokenize(+Text:text, -Tokens:list(term)) is semidet.
%
%  @see tokenize/3 when called with an empty list of options: thus, with
%  defaults.
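%
%  For instance, with the default options (among which, cased(false) folds
%  word tokens to lower case):
%
%  ==
%  ?- tokenize("Hello, world!", Tokens).
%  Tokens = [word(hello), punct(','), space(' '), word(world), punct(!)].
%  ==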

% TODO: add support for unicode

tokenize(Text, Tokens) :-
    tokenize(Text, Tokens, []).

%! tokenize(+Text:text, -Tokens:list(term), +Options:list(term)) is semidet.
%
%  True when Tokens is unified with a list of tokens representing the text
%  from Text, according to the options specified in Options.
%
%  Each token in Tokens will be one of:
%
%  * word(W)
%    Where W is comprised of contiguous alpha-numeric chars.
%  * punct(P)
%    Where char_type(P, punct).
%  * cntrl(C)
%    Where char_type(C, cntrl).
%  * space(S)
%    Where S == ' '.
%  * number(N)
%    Where number(N).
%  * string(S)
%    Where S was a sequence of bytes enclosed by double quotation marks.
%
%  Note that the above describes the default behavior, in which each token
%  is represented as an atom. This representation can be changed by using
%  the to option described below.
%
%  Valid Options are:
%
%  * cased(+boolean)
%    Determines whether tokens preserve the case of the source text.
%    Defaults to cased(false).
%  * spaces(+boolean)
%    Determines whether spaces are represented as tokens or discarded.
%    Defaults to spaces(true).
%  * cntrl(+boolean)
%    Determines whether control characters are represented as tokens or
%    discarded. Defaults to cntrl(true).
%  * punct(+boolean)
%    Determines whether punctuation characters are represented as tokens
%    or discarded. Defaults to punct(true).
%  * numbers(+boolean)
%    Determines whether the tokenizer represents and tags numbers.
%    Defaults to numbers(true).
%  * strings(+boolean)
%    Determines whether the tokenizer represents and tags strings.
%    Defaults to strings(true).
%  * pack(+boolean)
%    Determines whether tokens are packed or repeated. Defaults to
%    pack(false).
%  * to(+one_of([strings, atoms, chars, codes]))
%    Determines the representation format used for the tokens.
%    Defaults to to(atoms).
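%
%  For example, combining pack(true) with to(strings) (an illustrative
%  query: the run of two adjacent spaces is packed with a count):
%
%  ==
%  ?- tokenize("a  b", Tokens, [pack(true), to(strings)]).
%  Tokens = [word("a",1), space(" ",2), word("b",1)].
%  ==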

% TODO is it possible to achieve the proper semidet without the cut?
% Annie sez some parses are ambiguous, not even sure the cut should be
% there

tokenize(Text, ProcessedTokens, Options) :-
    must_be(nonvar, Text),
    string_codes(Text, Codes),
    process_options(Options, PreOpts, TokenOpts, PostOpts),
    preprocess(PreOpts, Codes, ProcessedCodes),
    phrase(tokens(TokenOpts, Tokens), ProcessedCodes),
    postprocess(PostOpts, Tokens, ProcessedTokens),
    !.

non_tokens([T])    --> T.
non_tokens([T|Ts]) --> T, non_tokens(Ts).

%! tokenize_file(+File:atom, -Tokens:list(term)) is semidet.
%
%  @see tokenize_file/3 when called with an empty list of options: thus,
%  with defaults.

% Note: does not use phrase_from_file/3, thus not lazy or transparent
% This choice was made so that tokenize_file will work with remotely
% accessed files.
% TODO: make this configurable, so it can be used in the different modes

% TODO: add more source options

tokenize_file(File, Tokens) :-
    tokenize_file(File, Tokens, []).

%! tokenize_file(+File:atom, -Tokens:list(term), +Options:list(term)) is semidet.
%
%  True when Tokens is unified with a list of tokens representing the text
%  of File.
%
%  @see tokenize/3 which has the same available options and behavior.
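%
%  For example, given a hypothetical file =|/tmp/doc.txt|= containing just
%  the text =|hi there|= (no trailing newline):
%
%  ==
%  ?- tokenize_file('/tmp/doc.txt', Tokens, [spaces(false)]).
%  Tokens = [word(hi), word(there)].
%  ==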

tokenize_file(File, Tokens, Options) :-
    read_file_to_codes(File, Codes, [encoding(utf8)]),
    tokenize(Codes, Tokens, Options).

%! untokenize(+Tokens:list(term), -Untokens:list(codes)) is semidet.
%
%  True when Untokens is unified with a code list representation of each
%  token in Tokens.
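%
%  For instance, a round trip through tokenize/3 (cased(true) preserves
%  the source text exactly):
%
%  ==
%  ?- tokenize("Hi there!", Tokens, [cased(true)]),
%     untokenize(Tokens, Codes),
%     format("~s~n", [Codes]).
%  Hi there!
%  ==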

% TODO structure(Options:[lines, brackets])
% TODO mode(generate) ; mode(parse)
% TODO add output format option
% TODO is it possible to achieve the proper semidet without the cut?

untokenize(Tokens, Untokens) :-
    untokenize(Tokens, Untokens, []).
untokenize(Tokens, Untokens, _Options) :-
    maplist(token_to(codes), Tokens, TokenCodes),
    phrase(non_tokens(TokenCodes), Untokens),
    !.

/******************************************
*      {PRE,POST}-PROCESSING HELPERS      *
******************************************/

preprocess(PreOpts, Codes, ProcessedCodes) :-
    preopts_data(cased, PreOpts, Cased),
    DCG_Rules = (
        preprocess_case(Cased)
    ),
    phrase(process_dcg_rules(DCG_Rules, ProcessedCodes), Codes).

postprocess(PostOpts, Tokens, ProcessedTokens) :-
    postopts_data(spaces, PostOpts, Spaces),
    postopts_data(cntrl, PostOpts, Cntrl),
    postopts_data(punct, PostOpts, Punct),
    postopts_data(to, PostOpts, To),
    postopts_data(pack, PostOpts, Pack),
    DCG_Rules = (
        keep_token(space(_), Spaces),
        keep_token(cntrl(_), Cntrl),
        keep_token(punct(_), Punct),
        convert_token(To)
    ),
    phrase(process_dcg_rules(DCG_Rules, PrePackedTokens), Tokens),
    (   Pack
    ->  phrase(pack_tokens(ProcessedTokens), PrePackedTokens)
    ;   ProcessedTokens = PrePackedTokens
    ).


/***********************************
*      POSTPROCESSING HELPERS      *
***********************************/

% Process a stream through a pipeline of DCG rules. Each rule is expected
% to push back one element, which [C] then collects; a rule that consumes
% its input without pushback (e.g. keep_token(_, false)) drops that
% element from the output.
process_dcg_rules(_, []) --> eos, !.
process_dcg_rules(DCG_Rules, []) --> DCG_Rules, eos, !.
process_dcg_rules(DCG_Rules, [C|Cs]) -->
    DCG_Rules,
    [C],
    process_dcg_rules(DCG_Rules, Cs).
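
% For example, threading the case-folding rule below over a code list
% (an illustrative query; Out unifies with the codes of "abc"):
%
%   ?- phrase(process_dcg_rules(preprocess_case(false), Out), `AbC`).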

preprocess_case(true), [C] --> [C].
preprocess_case(false), [CodeOut] --> [CodeIn],
    { to_lower(CodeIn, CodeOut) }.

keep_token(_, true), [T] --> [T].
keep_token(Token, false) --> [Token].
keep_token(Token, false), [T] --> [T], {T \= Token}.

convert_token(Type), [Converted] --> [Token],
    {token_to(Type, Token, Converted)}.
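
% E.g., filtering space tokens out of a token stream (illustrative):
%
%   ?- phrase(process_dcg_rules(keep_token(space(_), false), Kept),
%             [word(a), space(' '), word(b)]).
%   Kept = [word(a), word(b)].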

% Convert tokens to alternative representations. Note that the chars case
% uses string_chars/2 in forward mode (codes in, chars out), while the
% other cases wrap the conversion in inverse/3.
token_to(_, number(X), number(X)) :- !.
token_to(Type, Token, Converted) :-
    ( Type == strings -> Conversion = inverse(string_codes)
    ; Type == atoms   -> Conversion = inverse(atom_codes)
    ; Type == chars   -> Conversion = string_chars
    ; Type == codes   -> Conversion = string_codes
    ),
    call_into_term(Conversion, Token, Converted).
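
% E.g., token_to(atoms, word(`abc`), T) yields T = word(abc), while
% token_to(strings, word(`abc`), T) yields T = word("abc").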

% Packing repeating tokens
pack_tokens([T])    --> pack_token(T).
pack_tokens([T|Ts]) --> pack_token(T), pack_tokens(Ts).

pack_token(P) --> pack(Token, N), {Token =.. [F,T], P =.. [F,T,N]}.

pack(X, Count) --> [X], pack(X, 1, Count).

pack(_, Total, Total)      --> eos.
pack(X, Total, Total), [Y] --> [Y], { Y \= X }.
pack(X, Count, Total)      --> [X], { succ(Count, NewCount) },
                               pack(X, NewCount, Total).
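
% E.g., packing runs of identical tokens into counted tokens (illustrative):
%
%   ?- phrase(pack_tokens(Packed), [word(a), word(a), space(' ')]).
%   Packed = [word(a,2), space(' ',1)].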


/**************************
*      TOKENIZATION       *
**************************/

% NOTE: tokenize_text//0 appears unused; state//2 is not defined in this
% module.
tokenize_text --> state(Text, Tokenized),
                  { phrase(tokens(Tokenized), Text) }.


% PARSING

tokens(Opts, [T])    --> token(Opts, T), eos, !.
tokens(Opts, [T|Ts]) --> token(Opts, T), tokens(Opts, Ts).

% NOTE for debugging
% tokens(_)   --> {length(L, 200)}, L, {format(L)}, halt, !.

token(Opts, string(S)) -->
    { tokenopts_data(strings, Opts, true) },
    string(S).

token(Opts, number(N)) -->
    { tokenopts_data(numbers, Opts, true) },
    number(N), !.

token(_Opts, word(W))      --> word(W), eos, !.
token(_Opts, word(W)), ` ` --> word(W), ` `.
token(_Opts, word(W)), C   --> word(W), (punct(C) ; cntrl(C) ; nasciis(C)).

token(_Opts, space(S)) --> space(S).
token(_Opts, punct(P)) --> punct(P).
token(_Opts, cntrl(C)) --> cntrl(C).
token(_Opts, other(O)) --> nasciis(O).

space(` `) --> ` `.

sep --> ' '.
sep --> eos, !.

word(W) --> csyms(W).

% TODO Make open and close brackets configurable
string(S) --> string(`"`, `"`, S).
string(OpenBracket, CloseBracket, S) --> string_start(OpenBracket, CloseBracket, S).

% A string starts when we encounter an OpenBracket
string_start(OpenBracket, CloseBracket, Cs) -->
    OpenBracket, string_content(OpenBracket, CloseBracket, Cs).

% String content is everything up until we hit a CloseBracket
string_content(_OpenBracket, CloseBracket, []) --> CloseBracket, !.
% String content includes a bracket following an escape, but not the escape
string_content(OpenBracket, CloseBracket, [C|Cs]) -->
    escape,
    ( CloseBracket, {[C] = CloseBracket}
    | OpenBracket,  {[C] = OpenBracket}
    ),
    string_content(OpenBracket, CloseBracket, Cs).
% String content includes any character that isn't a CloseBracket or an escape.
string_content(OpenBracket, CloseBracket, [C|Cs]) -->
    [C],
    {[C] \= CloseBracket},
    string_content(OpenBracket, CloseBracket, Cs).
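
% E.g., parsing a double-quoted string with an escaped quote inside
% (illustrative; S is bound to the codes between the outer quotes):
%
%   ?- phrase(string(S), `"escaped \\" quote"`), format("~s~n", [S]).
%   escaped " quote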

csyms([L])    --> csym(L).
csyms([L|Ls]) --> csym(L), csyms(Ls).

csym(L)       --> [L], {code_type(L, csym)}.

% non ascii's
nasciis([C])     --> nascii(C), eos, !.
nasciis([C]),[D] --> nascii(C), [D], {D < 127}.
nasciis([C|Cs])  --> nascii(C), nasciis(Cs).

nascii(C)        --> [C], {C > 127}.
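
% E.g., phrase(nasciis(Cs), [200, 201]) unifies Cs with [200, 201]
% (illustrative: both codes lie above the ASCII range).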

' ' --> space.
' ' --> space, ' '.

escape --> `\\`.

% Any
... --> [].
... --> [_], ... .

space --> [S], {code_type(S, white)}.

punct([P]) --> [P], {code_type(P, punct)}.
cntrl([C]) --> [C], {code_type(C, cntrl)}.

% TODO move to general module

codes_to_lower([], []).
codes_to_lower([U|Uppers], [L|Lowers]) :-
    code_type(U, to_upper(L)),
    codes_to_lower(Uppers, Lowers).

call_into_term(P, Term, Result) :-
    Term =.. [F, Arg],
    call(P, Arg, ResultArg),
    Result =.. [F, ResultArg].

inverse(P, A, B) :-
    call(P, B, A).
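
% E.g., call_into_term(inverse(atom_codes), word(`abc`), T) rebuilds the
% token around the converted argument, yielding T = word(abc).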

pad(T_Args, X, T_X_Args) :-
    T_Args   =.. [T|Args],
    T_X_Args =.. [T, X| Args].